Example #1
class GenerateApp(App):
    """
  Generate empty documents.
  """
    ndocs_ap = ArgumentParser()
    ndocs_ap.add_argument(
        'ndocs',
        nargs='?',
        metavar='COUNT',
        type=int,
        default=float('inf'),
        help='The number of documents to generate (default: infinity)')
    arg_parsers = (ndocs_ap, OSTREAM_AP)

    def __call__(self):
        empty = io.BytesIO()
        writer = dr.Writer(empty, dr.Doc)
        writer.write(dr.Doc())
        empty = empty.getvalue()

        out = self.args.out_stream
        if six.PY3:
            out = out.buffer
        i = 0
        while i < self.args.ndocs:
            out.write(empty)
            i += 1
Example #2
class ListStoresApp(App):
    """
  List the stores available in the corpus.
  Where multiple documents are input, also indicates the number of documents in which each store appears.
  """
    # Extend to list fields, and fields on stored types
    ls_arg_parser = ArgumentParser()
    ls_arg_parser.add_argument('-e',
                               '--each-doc',
                               dest='show_each',
                               default=False,
                               action='store_true',
                               help='List stores for each doc')
    arg_parsers = (
        ls_arg_parser,
        DESERIALISE_AP,
    )

    def __call__(self):
        counter = collections.defaultdict(int)
        for i, doc in enumerate(self.raw_stream_reader):
            names = list(get_store_names(doc))
            if self.args.show_each:
                print(' '.join(sorted(names)))
            for name in names:
                counter[name] += 1
        try:
            if i == 0:
                fmt = '{name}'
            else:
                fmt = '{name}\t{count}'
        except NameError:
            print("No documents found", out=sys.stderr)
        for k, v in sorted(counter.items(), key=lambda tup: (-tup[1], tup[0])):
            print(fmt.format(name=k, count=v))
Example #3
class KFoldsEvaluator(Evaluator):
    """Distribute to each of k folds"""
    ap = ArgumentParser()
    ap.add_argument('kfolds', type=int)
    arg_parsers = (ap, )

    def __call__(self, doc, ind):
        return ind % self.args.kfolds
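
A quick check of the assignment rule: ind % kfolds deals documents out round-robin by stream index, so each fold receives every k-th document.

# Sketch (not part of the app): round-robin fold assignment for k = 3.
kfolds = 3
assert [ind % kfolds for ind in range(7)] == [0, 1, 2, 0, 1, 2, 0]
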
Example #4
class RandomEvaluator(Evaluator):
    """Shuffle the input randomly"""
    ap = ArgumentParser()
    ap.add_argument('--seed', dest='rand_seed', type=int, default=None)
    arg_parsers = (ap, )

    def __init__(self, argparser, args):
        super(RandomEvaluator, self).__init__(argparser, args)
        import random
        self.gen_random = random.Random(self.args.rand_seed).random

    def __call__(self, doc, ind):
        return self.gen_random()
Example #5
class RenameApp(App):
    """
  Rename specified fields or stores.
  """
    # TODO: rename annotation classes
    rename_list_ap = ArgumentParser()
    rename_list_ap.add_argument(
        'renames',
        nargs='+',
        type=RenameField,
        help='Rename description of form [Class.]new_name=old_name')
    arg_parsers = (rename_list_ap, ISTREAM_AP, OSTREAM_AP)

    def __init__(self, argparser, args):
        rename_dict = collections.defaultdict(set)
        for klass, new, old in (args.renames or ()):
            rename_dict[klass].add((new, old))
        args.renames = dict(rename_dict)
        super(RenameApp, self).__init__(argparser, args)

    def __call__(self):
        # FIXME: externalise reflection methods
        reader, writer = self.stream_reader_writer
        for doc in reader:
            classes = {None: doc.__class__}
            classes.update((store.klass_name, store._klass)
                           for store in six.itervalues(doc._dr_stores))
            for klass_name, klass in six.iteritems(classes):
                try:
                    renames = self.args.renames[klass_name]
                except KeyError:
                    continue

                relevant = []
                for new, old in renames:
                    try:
                        del klass._dr_s2p[old]
                    except KeyError:
                        pass
                    else:
                        relevant.append((new, old))
                # s2p isn't used in Writer at present, but we'll update it just in case
                klass._dr_s2p.update(relevant)

                fields = klass._dr_fields.copy()
                fields.update(getattr(klass, '_dr_stores', ()))
                for new, old in relevant:
                    fields[old].serial = new

            writer.write(doc)
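
For illustration, this is how RenameApp.__init__ groups parsed renames into a class-keyed dict; the tuples below are hypothetical stand-ins for what RenameField produces (class name, new name, old name), with None keying the document class itself.

import collections

renames = [(None, 'text', 'raw'), ('Token', 'norm', 'normal')]  # hypothetical parsed values
rename_dict = collections.defaultdict(set)
for klass, new, old in renames:
    rename_dict[klass].add((new, old))
assert dict(rename_dict) == {None: {('text', 'raw')},
                             'Token': {('norm', 'normal')}}
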
Example #6
class SubsetApp(App):
    """
  Extract documents by non-negative index or slice (a generalisation of head).

  Behaviour is undefined for overlapping slices.
  """
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        'slices',
        nargs='+',
        type=subset_type,
        help=
        'Non-negative slices in Python-like notation, e.g. 0, 5, :10, 5:10, 5:'
    )
    arg_parsers = (arg_parser, ISTREAM_AP, OSTREAM_AP)

    @staticmethod
    def gen_subsets(it, *slices):
        if not slices:
            for obj in it:
                yield obj
            return
        starts = {sl.start for sl in slices}
        if None in starts:
            starts.add(0)
        stops = {sl.stop for sl in slices}
        if None in stops:
            pairs = enumerate(it)
        else:
            pairs = zip(range(max(stops)), it)

        yielding = False
        for i, obj in pairs:
            yielding = (yielding and i not in stops) or i in starts
            if yielding:
                yield obj

    def _run(self, *slices):
        # TODO: avoid deserialising
        writer = self.raw_stream_writer
        reader = self.raw_stream_reader
        for doc in self.gen_subsets(reader, *slices):
            writer.write(doc)

    def __call__(self):
        self._run(*self.args.slices)
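
Since gen_subsets accepts any iterable, its slice-union semantics can be checked in isolation; a sketch, assuming the non-negative, non-overlapping slices the docstring requires:

# Union of 0:2 and 4:6 over a stream of ten items.
picked = list(SubsetApp.gen_subsets(range(10), slice(0, 2), slice(4, 6)))
assert picked == [0, 1, 4, 5]
# An open-ended slice keeps everything from its start onwards.
assert list(SubsetApp.gen_subsets(range(6), slice(3, None))) == [3, 4, 5]
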
Example #7
class SrcGenerator(App):
  """
  Generate source code for declaring types as instantiated in a given corpus, assuming headers are identical throughout.
  """
  srcgen_ap = ArgumentParser()
  add_subparsers(srcgen_ap, sorted(SrcGenLang.CLASSES.items()), 'gen_cls', title='target languages')
  srcgen_ap.add_argument('--doc-name', default='Document', help='The name of the document class (default: %(default)r)')
  srcgen_ap.add_argument('--indent', default='  ', help='The indent text (default: %(default)r)')
  arg_parsers = (srcgen_ap, DESERIALISE_AP, OSTREAM_AP)

  def __init__(self, argparser, args):
    super(SrcGenerator, self).__init__(argparser, args)
    self.generate = args.gen_cls(argparser, args)

  def __call__(self):
    doc = next(self.stream_reader)
    schema = doc._dr_rt.copy_to_schema()  # WARNING: using private
    self.generate(schema)
Example #8
class SetFieldApp(App):
    """
  Set a named field on each document to a value.
  """
    field_name_ap = ArgumentParser()
    field_name_ap.add_argument('field_name', help='The field name to set')
    arg_parsers = (field_name_ap, get_evaluator_ap(), DESERIALISE_AP,
                   OSTREAM_AP)

    def __call__(self):
        attr = self.args.field_name
        evaluator = self.evaluator
        reader, writer = self.stream_reader_writer
        for i, doc in enumerate(reader):
            if attr not in doc._dr_s2p:
                # TODO: externalise reflection methods
                doc._dr_s2p[attr] = attr
                doc._dr_fields[attr] = dr.Field(serial=attr)
            setattr(doc, attr, evaluator(doc, i))
            writer.write(doc)
Example #9
class WriteConll(App):
    """Writes documents in CONLL format, or a format which similarly lists fields separated by some delimiter.

  Example invocation:
  `cat docs.dr | dr conll --doc-class some.module.Document --norm -f pos --iob1 chunk.tag`
  For `--iob1 'chunk.tag'` to work, this assumes some.module.Document.drcli_decorate includes the following decoration:
    reverse_slices('chunks', 'tokens', 'span', all_attr='chunk')
  """

    annotations_ap = ArgumentParser()
    annotations_ap.add_argument(
        '--tok-store',
        dest='get_tokens',
        default=attrgetter('tokens'),
        type=attrgetter,
        help='Specify a particular Token store (default: tokens)')
    annotations_ap.add_argument(
        '--sent-store',
        dest='get_sentences',
        default=attrgetter('sentences'),
        type=attrgetter,
        help='Specify a particular Sentence store (default: sentences)')
    annotations_ap.add_argument(
        '--sent-tok-slice',
        dest='get_sent_tok_slice',
        default=attrgetter('span'),
        type=attrgetter,
        help=
        'The field on Sentence objects which indicates its slice over tokens (default: span)'
    )
    annotations_ap.add_argument(
        '--ignore-sents',
        dest='get_sentences',
        action='store_const',
        const=lambda doc: (_SuperSentence(), ),
        help='List all tokens as if in a single sentence')

    # TODO: use streams instead of string operations
    formatting_ap = ArgumentParser()
    formatting_ap.add_argument('--field-sep',
                               dest='fmt_fields',
                               default=fmt_separator('\t'),
                               type=fmt_separator,
                               help='Separator between fields (default: tab)')
    formatting_ap.add_argument(
        '--tok-sep',
        dest='fmt_toks',
        default=fmt_separator('\n'),
        type=fmt_separator,
        help='Separator between tokens (default: newline)')
    formatting_ap.add_argument(
        '--sent-sep',
        dest='fmt_sents',
        default=fmt_separator('\n\n'),
        type=fmt_separator,
        help='Separator between sentences (default: double-newline)')
    formatting_ap.add_argument(
        '--doc-sep',
        dest='fmt_docs',
        default=fmt_separator('\n\n#BEGIN-DOC\n\n'),
        type=fmt_separator,
        help='Separator between documents (default: #BEGIN-DOC)')
    formatting_ap.add_argument('--candc',
                               action=SetCandcAction,
                               nargs=0,
                               help='Use default C&C tagger format')

    field_list_ap = ArgumentParser()
    field_list_ap.add_argument('--norm',
                               dest='field_extractors',
                               const=get_norm,
                               action='append_const',
                               help='Output the normal token form')
    field_list_ap.add_argument('--raw',
                               dest='field_extractors',
                               const=get_raw,
                               action='append_const',
                               help='Output the raw token form')
    field_list_ap.add_argument('-f',
                               '--field',
                               dest='field_extractors',
                               type=attrgetter,
                               action='append',
                               help='Output the specified field')
    field_list_ap.add_argument(
        '--fn',
        dest='field_extractors',
        type=import_string,
        action='append',
        help='Output the result of a function given a token')

    # Slice fields:
    field_list_ap.add_argument(
        '--iob1',
        dest='field_extractors',
        action=_AppendSliceField,
        slice_fmt=partial(_IOB, mode=_IOB.IOB1),
        help=
        'Outputs IOB1 given the name of an attribute resulting from reverse_slices(.., all_attr=MY_ATTR)'
    )
    field_list_ap.add_argument(
        '--iob2',
        dest='field_extractors',
        action=_AppendSliceField,
        slice_fmt=partial(_IOB, mode=_IOB.IOB2),
        help=
        'Outputs IOB2 given the name of an attribute resulting from reverse_slices(.., all_attr=MY_ATTR)'
    )
    field_list_ap.add_argument(
        '--bilou',
        dest='field_extractors',
        action=_AppendSliceField,
        slice_fmt=_BILOU,
        help=
        'Outputs BILOU given the name of an attribute resulting from reverse_slices(.., all_attr=MY_ATTR)'
    )
    field_list_ap.add_argument(
        '--bmewo',
        dest='field_extractors',
        action=_AppendSliceField,
        slice_fmt=partial(_BILOU, tags='BMEOW'),
        help=
        'Outputs BMEWO given the name of an attribute resulting from reverse_slices(.., all_attr=MY_ATTR)'
    )
    # TODO: allow decorators to be specified on the command-line
    arg_parsers = (field_list_ap, formatting_ap, annotations_ap,
                   DESERIALISE_AP)

    def __init__(self, argparser, args):
        if not args.field_extractors:
            argparser.error('At least one field extractor is required')
        if not hasattr(args, 'clean_field'):
            args.clean_field = lambda s: s
        super(WriteConll, self).__init__(argparser, args)

    def __call__(self):
        self.write_flattened(
            sys.stdout.write,
            self.args.fmt_docs(
                self.process_doc(doc) for doc in self.stream_reader))

    def write_flattened(self, write, iterable):
        for fragment in iterable:
            if isinstance(fragment, six.string_types):
                write(fragment)
            else:
                self.write_flattened(write, fragment)

    def process_doc(self, doc):
        token_store = self.args.get_tokens(doc)
        return self.args.fmt_sents(
            self.begin_sentence() or self.process_sent(sent, token_store)
            for sent in self.args.get_sentences(doc))

    def process_sent(self, sent, tok_store):
        return self.args.fmt_toks(
            self.process_tok(tok)
            for tok in tok_store[self.args.get_sent_tok_slice(sent)])

    def process_tok(self, tok):
        return self.args.fmt_fields(
            self.args.clean_field(str(extr(tok)))
            for extr in self.args.field_extractors)

    def begin_sentence(self):
        # TODO: should only need to do these checks once per instance
        for extr in self.args.field_extractors:
            f = getattr(extr, 'begin_sentence', None)
            if f:
                f()
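
The fmt_* callables come from fmt_separator, which is defined elsewhere in drapps; the stand-in below is an assumption about its contract (interleave a separator between the items of an iterable) and makes write_flattened's recursive flattening easy to check.

# Hypothetical stand-in for fmt_separator.
def fmt_separator(sep):
    def fmt(iterable):
        first = True
        for item in iterable:
            if not first:
                yield sep
            yield item
            first = False
    return fmt

fmt_fields = fmt_separator('\t')
fmt_toks = fmt_separator('\n')
# Two tokens with two fields each flatten to 'The\tDT\nend\tNN'.
rows = fmt_toks(fmt_fields(fields) for fields in [['The', 'DT'], ['end', 'NN']])

out = []
def write_flattened(write, iterable):  # simplified copy of the method above
    for fragment in iterable:
        if isinstance(fragment, str):
            write(fragment)
        else:
            write_flattened(write, fragment)

write_flattened(out.append, rows)
assert ''.join(out) == 'The\tDT\nend\tNN'
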
Example #10
class CountApp(App):
    """
  Count the number of documents or annotations in named stores.

  Examples:
    %(prog)s
        # display the number of documents found on standard input
    %(prog)s *.dr
        # list the number of documents in each .dr file and their total
    %(prog)s -a
        # display the number of elements in each store
    %(prog)s -s tokens
        # display the total number of elements in the 'tokens' store
    %(prog)s -ds tokens
        # same with document count
    %(prog)s -ds tokens -s sentences
        # same with number of 'sentences' elements
    %(prog)s -ea
        # display the number of elements in each store per document
    %(prog)s -eac
        # display the cumulative number of elements in each store per document
    %(prog)s -eacj
        # the same with output in JSON rather than a table
    %(prog)s -tcv10
        # every 10 documents, display the time and number of documents processed
    %(prog)s -aj --average --bytes
        # display as JSON the average and total number of bytes consumed by each store
  """
    count_arg_parser = ArgumentParser()
    count_arg_parser.add_argument('-s',
                                  '--store',
                                  metavar='ATTR',
                                  dest='count_stores',
                                  action='append',
                                  default=[],
                                  help='Count the specified store')
    count_arg_parser.add_argument(
        '-d',
        '--docs',
        dest='count_docs',
        action='store_true',
        help='Count the number of documents (default when no stores are specified)'
    )
    count_arg_parser.add_argument(
        '-a',
        '--all',
        dest='count_all',
        action='store_true',
        help='Count docs and elements in all stores found on the first document'
    )
    count_arg_parser.add_argument('-v',
                                  '--every',
                                  dest='show_interval',
                                  type=int,
                                  metavar='N',
                                  help='Show counts every N docs')
    count_arg_parser.add_argument('-e',
                                  '--every1',
                                  dest='show_interval',
                                  action='store_const',
                                  const=1,
                                  help='Show counts every doc')
    count_arg_parser.add_argument(
        '--bytes',
        dest='count_bytes',
        action='store_true',
        default=False,
        help=
        'Count the number of bytes for each store, rather than the number of elements'
    )
    count_arg_parser.add_argument('--no-subtotal',
                                  dest='show_subtotal',
                                  default=True,
                                  action='store_false',
                                  help='Hides total count per input file')
    count_arg_parser.add_argument(
        '--no-total',
        dest='show_total',
        default=True,
        action='store_false',
        help='Hides total count across all documents')
    count_arg_parser.add_argument('--average',
                                  dest='show_average',
                                  default=False,
                                  action='store_true',
                                  help='Show an average size per document')
    count_arg_parser.add_argument(
        '--no-header',
        dest='show_header',
        default=True,
        action='store_false',
        help=
        'Hides the field names displayed by --fmt-table when more than one field is output'
    )
    count_arg_parser.add_argument('-c',
                                  '--cumulative',
                                  default=False,
                                  action='store_true',
                                  help='Show cumulative counts')
    count_arg_parser.add_argument('-t',
                                  '--timestamp',
                                  action='store_true',
                                  default=False,
                                  help='Output the time with each count')
    count_arg_parser.add_argument(
        '--sep',
        dest='field_sep',
        default='\t',
        help='Output field separator (with --fmt-table)')
    count_arg_parser.add_argument('--fmt-table',
                                  dest='formatter_cls',
                                  action='store_const',
                                  const=CountTableFormatter,
                                  default=CountTableFormatter,
                                  help='Format output as a table (default)')
    count_arg_parser.add_argument('-j',
                                  '--fmt-json',
                                  dest='formatter_cls',
                                  action='store_const',
                                  const=CountJsonFormatter,
                                  help='Format output as JSON')
    count_arg_parser.add_argument(
        'files',
        nargs='*',
        type=DrInputType,
        help='Specify files by name rather than standard input')
    arg_parsers = (
        count_arg_parser,
        ISTREAM_AP,
    )

    def __init__(self, argparser, args):
        if args.count_all and (args.count_docs or args.count_stores):
            argparser.error(
                '--all flag may not be used in conjunction with --docs or store names'
            )

        if not (args.count_docs or args.count_stores or args.count_all):
            args.count_docs = True
        if args.count_all:
            args.count_docs = True
        elif 1 == len(args.count_stores) + (1 if args.count_docs else 0):
            args.show_header = False

        if not args.files:
            args.files = [args.in_stream]
        if len(args.files) <= 1:
            args.show_subtotal = False

        if not (args.show_interval or args.show_header or args.show_total
                or args.show_subtotal or args.show_average):
            argparser.error('Nothing to display')

        if args.cumulative and not args.show_interval and not args.show_subtotal:
            argparser.error(
                '--cumulative may not apply without --every or per-file subtotals'
            )

        self.formatter = args.formatter_cls(args, sys.stdout)

        super(CountApp, self).__init__(argparser, args)

    def __call__(self):
        consts = CountFormatter
        unit = consts.COUNT_BYTES if self.args.count_bytes else consts.COUNT_ELEMENTS
        self.formatter.start()

        i = 0
        for in_file in self.args.files:
            if i and not self.args.cumulative:
                subtotals = [0] * len(extractors)
            for doc in read_raw_docs(in_file, on_end='break'):
                if not i:
                    names, extractors = self._get_counters(doc)
                    totals = [0] * len(extractors)
                    subtotals = [0] * len(extractors)
                    self.formatter.set_fields(names)

                doc_counts = [extract(doc) for extract in extractors]
                for j, c in enumerate(doc_counts):
                    subtotals[j] += c
                    totals[j] += c
                if self.args.show_interval and (
                        i + 1) % self.args.show_interval == 0:
                    if self.args.cumulative:
                        self.formatter.add_row(totals,
                                               i,
                                               agg=consts.AGG_SUM,
                                               filename=in_file.name,
                                               unit=unit)
                    else:
                        self.formatter.add_row(doc_counts,
                                               i,
                                               filename=in_file.name,
                                               unit=unit)

                i += 1

            if self.args.show_subtotal:
                try:
                    self.formatter.add_row(subtotals,
                                           consts.FILE,
                                           agg=consts.AGG_SUM,
                                           filename=in_file.name,
                                           unit=unit)
                except NameError:
                    print("No documents to count", file=sys.stderr)

        try:
            if self.args.show_total:
                self.formatter.add_row(totals,
                                       consts.ALL,
                                       agg=consts.AGG_SUM,
                                       unit=unit)
            if self.args.show_average:
                self.formatter.add_row([x / i for x in totals],
                                       consts.ALL,
                                       agg=consts.AGG_AVG,
                                       unit=unit)
        except NameError:
            print("No documents to count", file=sys.stderr)
        self.formatter.finish()

    def _get_counters(self, doc):
        names = []
        extractors = []
        if self.args.count_all:
            self.args.count_stores = sorted(get_store_names(doc))
            if self.args.count_bytes:
                self.args.count_stores.insert(0, b'__meta__')
        else:
            self.args.count_stores = [
                name.encode('utf-8') for name in self.args.count_stores
            ]
        if self.args.count_docs:
            names.append('docs')
            extractors.append(self._doc_counter)
        for store in self.args.count_stores:
            names.append(store.decode('utf-8'))
            extractors.append(self._make_store_counter(store))
        return names, extractors

    @staticmethod
    def _doc_counter(doc):
        return 1

    def _make_store_counter(self, attr):
        if not self.args.count_bytes:

            def count(doc):
                for name, klass, nelem in doc.stores:
                    if name == attr:
                        return nelem
                return 0
        else:
            # TODO: use wire count, relying on Joel's patches to msgpack-python
            def count(doc):
                if attr == b'__meta__':
                    return len(msgpack.packb(doc.doc))
                for i, (name, klass, nelem) in enumerate(doc.stores):
                    if name == attr:
                        return len(msgpack.packb(doc.instances[i]))
                return 0

        return count
Example #11
class UpgradeVersionApp(App):
    """Upgrade wire format"""

    MAX_VERSION = 3
    ver_ap = ArgumentParser()
    ver_ap.add_argument('-t',
                        '--target',
                        dest='target_version',
                        metavar='VERSION',
                        default=MAX_VERSION,
                        type=int,
                        help='The target version number')
    # TODO: add arguments to save output to input file
    arg_parsers = (ver_ap, ISTREAM_AP, OSTREAM_AP)

    def __call__(self):
        unpacker = msgpack.Unpacker(self.args.in_stream,
                                    use_list=True,
                                    encoding=None)
        out = self.args.out_stream
        if six.PY3:
            out = out.buffer
        while self.process_doc(unpacker, out):
            pass

    def process_doc(self, messages, out):
        try:
            version = next(messages)
        except StopIteration:
            return False
        if not isinstance(version, int):
            # Put the first message back on:
            messages = itertools.chain((version, ), messages)
            version = 1

        for version in range(version, self.args.target_version):
            messages = getattr(self,
                               'update_to_v{0}'.format(version + 1))(messages)

        msgpack.pack(
            self.args.target_version, out,
            use_bin_type=True)  # update functions do not output version
        for msg in messages:
            msgpack.pack(msg, out, use_bin_type=True)

        return True

    def update_to_v2(self, messages):
        """
    Performs the following changes:
    * Replaces is_slice value TRUE with NULL
    * Replaces slice stop from absolute to relative offset
    """
        # TODO: accept options to make certain fields self-pointers
        slice_fields = collections.defaultdict(set)
        meta_klass = None
        try:
            klasses = next(messages)
        except StopIteration as e:
            self._ended_early(e)

        for knum, (name, fields) in enumerate(klasses):
            if name == '__meta__':
                meta_klass = knum
            for fnum, fdef in enumerate(fields):
                if fdef.get(FieldType.IS_SLICE):
                    # None is the new True
                    fdef[FieldType.IS_SLICE] = None
                    slice_fields[knum].add(fnum)
        yield klasses  # changed
        del klasses

        try:
            stores = next(messages)
        except StopIteration as e:
            self._ended_early(e)
        yield stores  # unchanged

        for knum in itertools.chain((meta_klass, ),
                                    (k for name, k, size in stores)):
            try:
                nbytes = next(messages)
                instances = next(messages)
            except StopIteration as e:
                self._ended_early(e)

            if knum not in slice_fields:
                # unchanged
                yield nbytes
                yield instances
                continue

            inst_iter = (instances, ) if isinstance(instances,
                                                    dict) else instances
            ksl_fields = slice_fields[knum]
            for instance in inst_iter:
                for f in ksl_fields:
                    val = instance.get(f)
                    if val:
                        instance[f] = (val[0], val[1] - val[0])

            # changed
            yield len(msgpack.packb(instances))
            yield instances

    def _ended_early(self, exc):
        raise ValueError('Messages ended mid-document!')

    def _upgrade_obj_to_v2(self, obj):
        if isinstance(obj, list):
            for i, x in enumerate(obj):
                obj[i] = self._upgrade_obj_to_v2(x)
        elif isinstance(obj, dict):
            new_obj = {}
            for k, v in six.iteritems(obj):
                new_obj[self._upgrade_obj_to_v2(k)] = self._upgrade_obj_to_v2(
                    v)
            obj = new_obj
        elif isinstance(obj, bytes):
            try:
                obj = obj.decode('utf-8')
            except UnicodeDecodeError:
                pass
        return obj

    def update_to_v3(self, messages):
        """
    Tries to decode as UTF-8 all values that were the old MessagePack string type. Values that
    successfully decode are written back out as the new MessagePack UTF-8 type; the rest are written
    out as the new MessagePack bytes type.
    """
        klasses = next(messages)
        assert isinstance(klasses, list)

        stores = next(messages)
        assert isinstance(stores, list)

        doc_instance_nbytes = next(messages)
        assert isinstance(doc_instance_nbytes, int)
        doc_instance = next(messages)
        assert isinstance(doc_instance, dict)

        all_instance_groups = []
        for i in range(len(stores)):
            instance_nbytes = next(messages)
            assert isinstance(instance_nbytes, int)
            instance_groups = next(messages)
            assert isinstance(instance_groups, list)
            all_instance_groups.append(instance_groups)

        klasses = self._upgrade_obj_to_v2(klasses)
        yield klasses

        stores = self._upgrade_obj_to_v2(stores)
        yield stores

        doc_instance = self._upgrade_obj_to_v2(doc_instance)
        yield len(msgpack.packb(doc_instance, use_bin_type=True))
        yield doc_instance

        for instance_groups in all_instance_groups:
            instance_groups = self._upgrade_obj_to_v2(instance_groups)
            yield len(msgpack.packb(instance_groups, use_bin_type=True))
            yield instance_groups
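
The slice rewrite in update_to_v2 is a plain (start, stop) to (start, length) conversion; a worked example with a hypothetical field number 0:

instance = {0: (3, 7)}  # v1: field 0 slices tokens[3:7]
val = instance[0]
instance[0] = (val[0], val[1] - val[0])
assert instance[0] == (3, 4)  # v2: start 3, length 4
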
Example #12
class SelectApp(App):
    """
  Select only (or remove) specified fields on each document.
  """
    field_list_ap = ArgumentParser()
    field_list_ap.add_argument(
        'fields',
        nargs='+',
        type=SelectField,
        help=
        'Fields or stores to include (or exclude with -x). These are attributes on the document by default. When taking the form Class.field, Class objects will be similarly processed to retain or exclude given fields.'
    )
    field_list_ap.add_argument(
        '-x',
        '--exclude',
        action='store_true',
        default=False,
        help=
        'Treat all fields listed as those to exclude rather than to retain.')
    arg_parsers = (field_list_ap, ISTREAM_AP, OSTREAM_AP)

    def __init__(self, argparser, args):
        field_dict = collections.defaultdict(set)
        for klass, field in (args.fields or ()):
            field_dict[klass].add(field)
        args.doc_fields = field_dict[None]
        args.annot_fields = dict(field_dict)
        if args.exclude:
            self._perform = self._perform_exclude
        else:
            self._perform = self._perform_select
        super(SelectApp, self).__init__(argparser, args)

    def __call__(self):
        # FIXME: externalise reflection methods ... or avoid it by just deleting attributes
        reader, writer = self.stream_reader_writer
        for doc in reader:
            for store in six.itervalues(doc._dr_stores):
                try:
                    fields = self.args.annot_fields[store.klass_name]
                except KeyError:
                    continue
                self._perform(fields, store._klass._dr_s2p,
                              store._klass._dr_fields)

            if self.args.doc_fields:
                self._perform(self.args.doc_fields, doc._dr_s2p,
                              doc._dr_fields, doc._dr_stores)
            writer.write(doc)

    def _perform_exclude(self, fields, *attr_dicts):
        # FIXME: work for non-identity s2p maps, if necessary
        for attr_dict in attr_dicts:
            for f in fields:
                try:
                    del attr_dict[f]
                except KeyError:
                    pass

    def _perform_select(self, fields, *attr_dicts):
        # FIXME: work for non-identity s2p maps, if necessary
        for attr_dict in attr_dicts:
            for f in set(attr_dict) - fields:
                try:
                    del attr_dict[f]
                except KeyError:
                    pass
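
Both strategies reduce to deleting keys from the reflection dicts; the select path is a set difference, shown here on a plain dict for illustration:

attr_dict = {'norm': 1, 'pos': 2, 'raw': 3}  # hypothetical field map
fields = {'norm', 'pos'}
for f in set(attr_dict) - fields:
    del attr_dict[f]
assert attr_dict == {'norm': 1, 'pos': 2}
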
Example #13
class DumpApp(App):
    """
  Debug: unpack the stream and pretty-print it.
  """
    dump_ap = ArgumentParser()
    dump_ap.add_argument(
        '-m',
        '--human',
        dest='human_readable',
        action='store_true',
        default=False,
        help=
        'Reinterpret the messages to be more human-readable by integrating headers into content.'
    )
    dump_ap.add_argument(
        '-n',
        '--numbered',
        action='store_true',
        default=False,
        help=
        'In --human mode, add a \'#\' field to each annotation, indicating its ordinal index'
    )
    dump_ap.add_argument('-d',
                         '--headers',
                         dest='hide_instances',
                         action='store_true',
                         default=False,
                         help='Show headers only, hiding any instances')
    dump_ap.add_argument(
        '-r',
        '--reverse-pointers',
        action='store_true',
        default=False,
        help=
        'Show pointer and slice sources at their target sites, only if --human'
    )
    dump_ap.add_argument('-j',
                         '--json',
                         dest='format',
                         action='store_const',
                         const='json',
                         default='pprint',
                         help='Output valid JSON')
    arg_parsers = (dump_ap, ISTREAM_AP, OSTREAM_AP)

    def dump(self, obj):
        print(self.format(obj), file=self.args.out_stream)

    def __call__(self, encoding='utf-8'):
        if six.PY2 and isinstance(encoding, six.text_type):
            encoding = encoding.encode('utf-8')
        self.format = FORMATTERS[self.args.format]
        unpacker = msgpack.Unpacker(self.args.in_stream, encoding=encoding)
        if self.args.human_readable:
            unpacker = self._integrate_names(unpacker)
        elif self.args.hide_instances:
            unpacker = self._headers_only(unpacker)
        first = True
        for obj in unpacker:
            if self.args.format == 'json':
                print('[' if first else ',', file=self.args.out_stream)
            self.dump(obj)
            first = False
        if self.args.format == 'json':
            print(']', file=self.args.out_stream)

    def _headers_only(self, unpacker):
        for doc in read_raw_docs(unpacker):
            yield doc.version
            yield doc.klasses
            yield doc.stores

    def _integrate_names(self, unpacker):
        for i, doc in enumerate(read_raw_docs(unpacker)):
            obj = {}
            obj['__version__'] = doc.version
            store_defs = list(self._process_store_defs(doc.stores,
                                                       doc.klasses))
            obj['__meta__'] = {
                'fields':
                dict(
                    self._fields_to_dict(doc.klasses[META_TYPE][1],
                                         store_defs)),
                'item':
                self._process_annot(doc.doc, doc.klasses[META_TYPE][1])
            }
            if self.args.numbered:
                obj['#'] = i
            for (store_name, store), instances in zip(store_defs,
                                                      doc.instances):
                obj[store_name] = store
                if not self.args.hide_instances:
                    store['items'] = [
                        self._process_annot(item, store['fields'])
                        for item in instances
                    ]
                    if self.args.numbered:
                        for j, item in enumerate(store['items']):
                            item['#'] = j
                store['fields'] = dict(
                    self._fields_to_dict(store['fields'], store_defs))

            if self.args.reverse_pointers:
                self._reverse_pointers_with_names(obj)

            yield obj

    def _process_store_defs(self, msg, types):
        for name, typ, size in msg:
            try:
                type_name, type_fields = types[typ]
            except IndexError:
                # for robustness to broken data
                type_name, type_fields = '??MissingType={0}'.format(typ), ()
            yield name, {
                'type': type_name,
                'fields': type_fields,
                'count': size
            }

    def _process_annot(self, msg, fields):
        return dict(
            (fields[fnum][FieldType.NAME], val) for fnum, val in msg.items())

    TRAIT_NAMES = {
        FieldType.IS_SLICE: 'is slice',
        FieldType.IS_SELF_POINTER: 'is self-pointer',
        FieldType.IS_COLLECTION: 'is collection',
    }

    def _fields_to_dict(self, fields, store_defs, trait_names=TRAIT_NAMES):
        for field in fields:
            name = None
            traits = {}
            for k, v in field.items():
                if k == FieldType.NAME:
                    name = v
                elif k == FieldType.POINTER_TO:
                    traits['points to'], store_data = store_defs[v]
                elif k in trait_names:
                    traits[trait_names[k]] = v
                else:
                    traits[k] = v
            yield name, traits

    def _reverse_pointers_with_names(self, obj):
        for source_name, source_store in obj.items():
            if source_name == '__version__':
                continue
            for source_field, source_desc in source_store.get('fields',
                                                              {}).items():
                target_name = source_desc.get('points to')
                if target_name is None:
                    continue

                qual_field = '{}.{}'.format(source_name, source_field)
                target_items = obj[target_name]['items']
                is_slice = 'is slice' in source_desc
                if source_name == '__meta__':
                    source_items = [source_store['item']]
                else:
                    source_items = source_store['items']
                for i, source_item in enumerate(source_items):
                    pointers = source_item.get(source_field)
                    if not pointers:
                        continue
                    if is_slice:
                        for target in target_items[pointers[0]:pointers[0] +
                                                   pointers[1]]:
                            target.setdefault(qual_field, []).append(i)
                    else:
                        if isinstance(pointers, list):
                            for j in pointers:
                                target_items[j].setdefault(qual_field,
                                                           []).append(i)
                        else:
                            target_items[pointers].setdefault(qual_field,
                                                              []).append(i)
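
To see what --reverse-pointers does with a slice field, consider a hypothetical 'chunks' store whose 'span' field slices over 'tokens': a wire value of (2, 2) (start, length) gives tokens 2 and 3 a back-reference to chunk 0.

target_items = [{}, {}, {}, {}]  # the 'tokens' items
pointers = (2, 2)                # hypothetical chunks[0]['span']
for target in target_items[pointers[0]:pointers[0] + pointers[1]]:
    target.setdefault('chunks.span', []).append(0)
assert target_items == [{}, {}, {'chunks.span': [0]}, {'chunks.span': [0]}]
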
Example #14
class HackHeaderApp(App):
    """
  Debug: rewrite header components of given documents using Python literal input
  """
    hack_ap = ArgumentParser()
    hack_ap.add_argument(
        '--klasses',
        default=None,
        help='Overwrites the entire klasses header with the given list')
    hack_ap.add_argument(
        '-k',
        '--klass',
        default=[],
        action='append',
        help=
        'Overwrites a klass definition, specified with <name|num>=[<new_name>,<field list>]'
    )
    hack_ap.add_argument(
        '-f',
        '--field',
        default=[],
        action='append',
        help=
        'Overwrites a field definition, specified with <klass-name|num>.<field-name|num>[+]=<map> (use += for update semantics)'
    )
    arg_parsers = (hack_ap, ISTREAM_AP, OSTREAM_AP)

    def __init__(self, argparser, args):
        super(HackHeaderApp, self).__init__(argparser, args)

        def parse(s, exp_type):
            try:
                res = ast.literal_eval(s)
            except (SyntaxError, ValueError):
                argparser.error('{0} is not a valid Python literal'.format(s))
            if exp_type is not None and type(res) != exp_type:
                argparser.error('{0} does not evaluate to type {1}'.format(
                    s, exp_type))
            return res

        self.operations = []

        if args.klasses:
            self.operations.append((self._set_klasses, {
                'value': parse(args.klasses, list)
            }))

        for arg in args.klass:
            try:
                key, value = arg.split('=', 1)
            except ValueError:
                argparser.error('Expected <name>=<value>, got {0}'.format(arg))
            try:
                key = int(key)
            except ValueError:
                pass
            value = parse(value, list)
            if len(value) != 2:
                argparser.error(
                    'Expected a list of length 2, got {0}'.format(value))
            self.operations.append((self._set_klass, {
                'klass': key,
                'value': value
            }))

        for arg in args.field:
            try:
                key, value = arg.split('=', 1)
                kname, fname = key.split('.')
            except ValueError:
                argparser.error(
                    'Expected <kname>.<fname>=<value>, got {0}'.format(arg))
            if fname.endswith('+'):
                fname = fname[:-1]
                update = True
            else:
                update = False
            try:
                kname = int(kname)
            except ValueError:
                pass
            try:
                fname = int(fname)
            except ValueError:
                pass
            value = parse(value, dict)
            self.operations.append((self._set_field, {
                'klass': kname,
                'field': fname,
                'value': value,
                'update': update
            }))

        if not self.operations:
            argparser.error('Nothing to do!')

    def _set_klasses(self, klasses, stores, value):
        klasses[:] = value

    def _set_klass(self, klasses, stores, klass, value):
        if klass == len(klasses):
            klasses.append(value)
        for knum, (kname, fields) in enumerate(klasses):
            if klass in (knum, kname):
                klasses[knum] = value
                return
        raise ValueError('Could not find class {0}'.format(klass))

    def _set_field(self, klasses, stores, klass, field, value, update=False):
        for knum, (kname, fields) in enumerate(klasses):
            if klass not in (knum, kname):
                continue

            if field == len(fields):
                fields.append({})

            for fnum, fdef in enumerate(fields):
                fname = fdef.get(FieldType.NAME)
                if field in (fnum, fname):
                    if update:
                        fields[fnum].update(value)
                    else:
                        fields[fnum] = value
                    return

        raise ValueError('Could not find field {1} in class {0}'.format(
            klass, field))

    def __call__(self):
        writer = self.raw_stream_writer
        for doc in self.raw_stream_reader:
            for fn, kwargs in self.operations:
                fn(doc.klasses, doc.stores, **kwargs)
            writer.write(doc)
Example #15
class ShellApp(App):
    """
  Loads the given input file into a Python shell as the variable `docs`

  Examples:
    %(prog)s -c 'for doc in docs: do_something()'
        # executes the given code on `docs` read with automagic from standard input
    %(prog)s -o out.dr -c 'for doc in docs: do_something() and write_doc(doc)'
        # same, writing the documents to out.dr
    %(prog)s path.dr
        # open an interactive Python shell with `docs` read from path.dr with automagic
    %(prog)s --doc-class pkg.module.DocSchema path.dr
        # same, but using the specified schema

  """
    SHELLS = ('ipython', 'bpython', 'python')
    ap = ArgumentParser()
    ap.add_argument(
        '-s',
        '--shell',
        default=None,
        help='One of {0} (default: try these in order)'.format(SHELLS))
    ap.add_argument(
        '--doc-class',
        metavar='CLS',
        dest='doc_class',
        type=import_string,
        default=None,
        help=
        'Import path to the Document class for the input.  If available, doc.{0}() will be called for each document on the stream.'
        .format(DECORATE_METHOD))
    ap.add_argument('-o',
                    '--out-file',
                    type=argparse.FileType('wb'),
                    default=None,
                    help='The output file, written to by `write_doc`')
    ap.add_argument(
        '-c',
        '--code',
        default=None,
        help=
        'Execute the specified code (before opening an interactive session if -i is also used)'
    )
    ap.add_argument('-i',
                    '--interactive',
                    default=False,
                    action='store_true',
                    help='Use an interactive shell even if -c is supplied')
    ap.add_argument('in_file',
                    type=DrInputType,
                    nargs='?',
                    default=None,
                    help='The input file')
    arg_parsers = (ap, )

    def __init__(self, argparser, args):
        args.interactive = args.interactive or args.code is None
        if args.interactive and not args.in_file:
            argparser.error(
                'Cannot read documents from STDIN in interactive mode. Please provide a path to the documents.'
            )
        if not args.in_file:
            import sys
            args.in_file = sys.stdin
        super(ShellApp, self).__init__(argparser, args)

    def __call__(self):
        local = self.build_locals()
        if self.args.code:
            exec(self.args.code,
                 local)  # XXX: this is actually using globals, not locals
        if not self.args.interactive:
            return

        tmp = local
        local = self.run_startup()
        local.update(tmp)

        shells = [self.args.shell] if self.args.shell else self.SHELLS
        last_exc = None
        for shell in shells:
            try:
                return getattr(self, 'run_' + shell)(local)
            except ImportError as exc:
                # Python 3 unbinds the except target after the block, so keep a reference
                last_exc = exc
        raise last_exc

    def build_locals(self):
        res = {'__name__': '__main__'}
        from schwa import dr
        reader, schema = self.get_reader_and_schema(self.args.in_file)
        res.update({'dr': dr, 'docs': reader})
        if self.args.out_file:
            res['write_doc'] = dr.Writer(self.args.out_file, schema).write
        return res

    def run_startup(self):
        res = {'__name__': '__main__'}
        pythonrc = os.environ.get('PYTHONSTARTUP')
        if pythonrc and os.path.isfile(pythonrc):
            with open(pythonrc, 'r') as f:
                try:
                    exec(f.read(), res)
                except NameError:
                    pass
        try:
            exec('import user', res)
        except ImportError:
            pass
        return res

    def run_ipython(self, local):
        try:
            from IPython.terminal.embed import TerminalInteractiveShell
            shell = TerminalInteractiveShell(user_ns=local)
            shell.mainloop()
        except ImportError:
            # IPython < 0.11
            # Explicitly pass an empty list as arguments, because otherwise
            # IPython would use sys.argv from this script.
            from IPython.Shell import IPShell
            shell = IPShell(argv=[], user_ns=local)
            shell.mainloop()

    def run_bpython(self, local):
        import bpython
        bpython.embed(locals_=local)

    def run_python(self, local):
        import code
        try:
            import readline
        except ImportError:
            pass
        else:
            import rlcompleter
            readline.set_completer(rlcompleter.Completer(local).complete)
            readline.parse_and_bind('tab:complete')
        code.interact(local=local)
Example #16
class SplitApp(App):
    """
  Split a stream into k files, or a separate file for each key determined per doc.
  To perform stratified k-fold validation, first sort the corpus by the stratification label.

  If the evaluation returns a list, the document is written to each key in the list.
  """
    multioutput_ap = ArgumentParser()
    multioutput_ap.add_argument(
        '-t',
        '--template',
        dest='path_tpl',
        default='fold{n:03d}.dr',
        help=
        'A template for output paths (default: %(default)s). {n} substitutes for fold number, {key} for evaluation output.'
    )
    multioutput_ap.add_argument(
        '--overwrite',
        action='store_true',
        default=False,
        help='Overwrite an output file if it already exists.')
    multioutput_ap.add_argument(
        '--sparse',
        action='store_true',
        default=False,
        help=
        'Use append mode to write files, and close the handle between writes')
    multioutput_ap.add_argument('--make-dirs',
                                action='store_true',
                                default=False,
                                help='Make directories when necessary')
    arg_parsers = (
        DESERIALISE_AP,
        multioutput_ap,
        get_evaluator_ap({'k': KFoldsEvaluator}),
    )

    def __init__(self, argparser, args):
        if '{' not in args.path_tpl:
            argparser.error(
                'Output path template must include a substitution (e.g. {n:02d} or {key})'
            )
        super(SplitApp, self).__init__(argparser, args)
        if self.args.sparse:
            if self.args.overwrite:
                argparser.error('--overwrite does not apply with --sparse')
            if isinstance(self.evaluator, KFoldsEvaluator):
                argparser.error('k-folds cannot be used with --sparse')
            if any(expr in args.path_tpl
                   for expr in ('{n}', '{n!', '{n:')):  # FIXME: use regexp
                argparser.error('--sparse must use filenames templated by key')

    def __call__(self):
        # TODO: clean up!!
        evaluator = self.evaluator
        if isinstance(evaluator, KFoldsEvaluator):
            # avoid full deserialisation
            # TODO: make more generic
            reader = self.raw_stream_reader
            from drapps.util import RawDocWriter
            make_writer = RawDocWriter
        else:
            reader, schema = self.get_reader_and_schema()
            make_writer = lambda out: dr.Writer(out, schema)

        if self.args.make_dirs:

            def fopen(path, mode):
                dirname = os.path.dirname(path)
                if not os.path.exists(dirname):
                    cur = ''
                    for part in dirname.split(os.path.sep):
                        cur += part
                        if part and not os.path.exists(cur):
                            os.mkdir(cur)
                        cur += os.path.sep
                return open(path, mode)
        else:
            fopen = open

        def new_writer(key):
            fold_num = len(writers)
            path = self.args.path_tpl.format(n=fold_num, key=key)
            if not self.args.overwrite and os.path.exists(path):
                print('Path {0} already exists. Use --overwrite to overwrite.'.
                      format(path),
                      file=sys.stderr)
                sys.exit(1)
            print('Writing fold {k} to {path}'.format(k=fold_num, path=path),
                  file=sys.stderr)
            return make_writer(fopen(path, 'wb'))

        if self.args.sparse:
            get_writer = lambda key: make_writer(
                fopen(self.args.path_tpl.format(key=key), 'ab'))
        else:
            writers = {}

            def get_writer(key):
                try:
                    writer = writers[key]
                except KeyError:
                    writer = writers[key] = new_writer(key)
                return writer

        for i, doc in enumerate(reader):
            val = evaluator(doc, i)
            for key in val if isinstance(val, list) else (val, ):
                writer = get_writer(key)
                writer.write(doc)
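
The output paths come straight from str.format on the template, so the default template zero-pads the fold number while a {key} template substitutes the evaluator's output:

# str.format ignores unused keyword arguments, so both names can always be passed.
assert 'fold{n:03d}.dr'.format(n=2, key=None) == 'fold002.dr'
assert '{key}.dr'.format(n=0, key='train') == 'train.dr'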