Example #1
 def test_iterators(self):
     pairs = [('c', 1), ('b', 2), ('a', 3), ('d', 4), ('e', 5), ('f', 6)]
     shuffle(pairs)
     od = OrderedDict(pairs)
     self.assertEqual(list(od), [t[0] for t in pairs])
     self.assertEqual(list(od.keys()), [t[0] for t in pairs])
     self.assertEqual(list(od.values()), [t[1] for t in pairs])
     self.assertEqual(list(od.items()), pairs)
     self.assertEqual(list(reversed(od)), [t[0] for t in reversed(pairs)])
     self.assertEqual(list(reversed(od.keys())),
                      [t[0] for t in reversed(pairs)])
     self.assertEqual(list(reversed(od.values())),
                      [t[1] for t in reversed(pairs)])
     self.assertEqual(list(reversed(od.items())), list(reversed(pairs)))
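A minimal sketch of the behavior this test pins down (assuming Python 3, where OrderedDict views support reversed()):

    from collections import OrderedDict

    od = OrderedDict([('c', 1), ('b', 2), ('a', 3)])
    print(list(od.keys()))             # ['c', 'b', 'a'] -- insertion order, not sorted order
    print(list(reversed(od)))          # ['a', 'b', 'c'] -- keys, walked backwards
    print(list(reversed(od.items())))  # [('a', 3), ('b', 2), ('c', 1)]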
Example #2
    def test_init(self):
        with self.assertRaises(TypeError):
            OrderedDict([('a', 1), ('b', 2)], None)  # too many args
        pairs = [('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5)]
        self.assertEqual(sorted(OrderedDict(dict(pairs)).items()),
                         pairs)  # dict input
        self.assertEqual(sorted(OrderedDict(**dict(pairs)).items()),
                         pairs)  # kwds input
        self.assertEqual(list(OrderedDict(pairs).items()),
                         pairs)  # pairs input
        self.assertEqual(
            list(
                OrderedDict([('a', 1), ('b', 2), ('c', 9), ('d', 4)], c=3,
                            e=5).items()), pairs)  # mixed input

        # cyordereddict: remove this test because slot wrappers (on extension
        # types) cannot be inspected
        # make sure no positional args conflict with possible kwdargs
        # self.assertEqual(inspect.getargspec(OrderedDict.__dict__['__init__']).args,
        #                  ['self'])

        # Make sure that direct calls to __init__ do not clear previous contents
        d = OrderedDict([('a', 1), ('b', 2), ('c', 3), ('d', 44), ('e', 55)])
        d.__init__([('e', 5), ('f', 6)], g=7, d=4)
        self.assertEqual(list(d.items()), [('a', 1), ('b', 2), ('c', 3),
                                           ('d', 4), ('e', 5), ('f', 6),
                                           ('g', 7)])
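Note that the test compares the dict and keyword inputs with sorted(): on older Pythons, plain dicts and **kwargs do not preserve insertion order, so only the pairs input can be checked order-sensitively. A small sketch of the __init__-does-not-clear behavior asserted at the end:

    from collections import OrderedDict

    d = OrderedDict([('a', 1), ('b', 2)])
    d.__init__([('b', 20)], c=3)  # behaves like update(): keeps 'a', updates 'b' in place
    print(list(d.items()))        # [('a', 1), ('b', 20), ('c', 3)]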
Example #3
    def test_init(self):
        with self.assertRaises(TypeError):
            OrderedDict([('a', 1), ('b', 2)], None)  # too many args
        pairs = [('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5)]
        self.assertEqual(sorted(OrderedDict(dict(pairs)).items()),
                         pairs)  # dict input
        self.assertEqual(sorted(OrderedDict(**dict(pairs)).items()),
                         pairs)  # kwds input
        self.assertEqual(list(OrderedDict(pairs).items()),
                         pairs)  # pairs input
        self.assertEqual(
            list(
                OrderedDict([('a', 1), ('b', 2), ('c', 9), ('d', 4)], c=3,
                            e=5).items()), pairs)  # mixed input

        # make sure no positional args conflict with possible kwdargs
        self.assertEqual(list(OrderedDict(self=42).items()), [('self', 42)])
        self.assertEqual(list(OrderedDict(other=42).items()), [('other', 42)])
        self.assertRaises(TypeError, OrderedDict, 42)
        self.assertRaises(TypeError, OrderedDict, (), ())
        self.assertRaises(TypeError, OrderedDict.__init__)

        # Make sure that direct calls to __init__ do not clear previous contents
        d = OrderedDict([('a', 1), ('b', 2), ('c', 3), ('d', 44), ('e', 55)])
        d.__init__([('e', 5), ('f', 6)], g=7, d=4)
        self.assertEqual(list(d.items()), [('a', 1), ('b', 2), ('c', 3),
                                           ('d', 4), ('e', 5), ('f', 6),
                                           ('g', 7)])
Example #4
    def test_setdefault(self):
        pairs = [('c', 1), ('b', 2), ('a', 3), ('d', 4), ('e', 5), ('f', 6)]
        shuffle(pairs)
        od = OrderedDict(pairs)
        pair_order = list(od.items())
        self.assertEqual(od.setdefault('a', 10), 3)
        # make sure order didn't change
        self.assertEqual(list(od.items()), pair_order)
        self.assertEqual(od.setdefault('x', 10), 10)
        # make sure 'x' is added to the end
        self.assertEqual(list(od.items())[-1], ('x', 10))

        # make sure setdefault still works when __missing__ is defined
        class Missing(OrderedDict):
            def __missing__(self, key):
                return 0
        self.assertEqual(Missing().setdefault(5, 9), 9)
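A quick sketch of the __missing__ interaction being asserted: plain lookups on the subclass go through __missing__, but setdefault does not, so it returns and inserts the supplied default:

    from collections import OrderedDict

    class Missing(OrderedDict):
        def __missing__(self, key):
            return 0

    m = Missing()
    print(m[5])                # 0 -- __missing__ answers the lookup, nothing is stored
    print(m.setdefault(5, 9))  # 9 -- setdefault bypasses __missing__ and inserts the default
    print(list(m.items()))     # [(5, 9)]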
Example #5
 def test_delitem(self):
     pairs = [('c', 1), ('b', 2), ('a', 3), ('d', 4), ('e', 5), ('f', 6)]
     od = OrderedDict(pairs)
     del od['a']
     self.assertNotIn('a', od)
     with self.assertRaises(KeyError):
         del od['a']
     self.assertEqual(list(od.items()), pairs[:2] + pairs[3:])
Example #6
 def test_reinsert(self):
     # Given insert a, insert b, delete a, re-insert a,
     # verify that a is now later than b.
     od = OrderedDict()
     od['a'] = 1
     od['b'] = 2
     del od['a']
     od['a'] = 1
     self.assertEqual(list(od.items()), [('b', 2), ('a', 1)])
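On Python 3, OrderedDict.move_to_end gives the same end state without the delete/re-insert dance (a minimal sketch):

    from collections import OrderedDict

    od = OrderedDict([('a', 1), ('b', 2)])
    od.move_to_end('a')
    print(list(od.items()))  # [('b', 2), ('a', 1)]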
Example #7
 def _stats(self):
     _stats = OrderedDict()
     _stats['id_string'] = self.id_string
     _stats['versions'] = len(self.versions)
     # _stats['submissions'] = self.submissions_count()
     _stats['row_count'] = len(self[-1].schema.get('content', {})
                                              .get('survey', []))
     # returns stats in the format [ key="value" ]
     return '\n\t'.join('%s="%s"' % item for item in _stats.items())
Example #8
class NegraCorpusReader(CorpusReader):
    """Read a corpus in the Negra export format."""

    def blocks(self):
        if self._block_cache is None:
            self._block_cache = OrderedDict(self._read_blocks())
        return OrderedDict((a, '\n'.join(b) + '\n')
                           for a, b in self._block_cache.items())

    def _read_blocks(self):
        """Read corpus and yield blocks corresponding to each sentence."""
        results = set()
        started = False
        for filename in self._filenames:
            with openread(filename, encoding=self._encoding) as inp:
                for line in inp:
                    if line.startswith('#BOS '):
                        if started:
                            raise ValueError('beginning of sentence marker '
                                             'while previous one still open: %s' % line)
                        started = True
                        line = line.strip()
                        sentid = line.split()[1]
                        lines = [line]
                    elif line.startswith('#EOS '):
                        if not started:
                            raise ValueError('end of sentence marker while '
                                             'none started')
                        thissentid = line.strip().split()[1]
                        if sentid != thissentid:
                            raise ValueError('unexpected sentence id: '
                                             'start=%s, end=%s' % (sentid, thissentid))
                        started = False
                        if sentid in results:
                            raise ValueError('duplicate sentence ID: %s' % sentid)
                        results.add(sentid)
                        lines.append(line.strip())
                        yield sentid, lines
                    elif started:
                        lines.append(line.strip())
                    # other lines are ignored: #FORMAT x, %% comments, ...

    def _parse(self, block):
        return exporttree(block, self.functions, self.morphology, self.lemmas)
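The core of _read_blocks is a #BOS/#EOS block scanner; a stripped-down, self-contained sketch of that loop (without the reader's error checking):

    def iter_export_blocks(lines):
        """Yield (sentid, block_lines) for each #BOS ... #EOS span."""
        sentid, block = None, []
        for line in lines:
            line = line.strip()
            if line.startswith('#BOS '):
                sentid, block = line.split()[1], [line]
            elif line.startswith('#EOS '):
                block.append(line)
                yield sentid, block
                sentid, block = None, []
            elif sentid is not None:
                block.append(line)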
Example #9
    def test_update(self):
        with self.assertRaises(TypeError):
            OrderedDict().update([('a', 1), ('b', 2)], None)  # too many args
        pairs = [('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5)]
        od = OrderedDict()
        od.update(dict(pairs))
        self.assertEqual(sorted(od.items()), pairs)  # dict input
        od = OrderedDict()
        od.update(**dict(pairs))
        self.assertEqual(sorted(od.items()), pairs)  # kwds input
        od = OrderedDict()
        od.update(pairs)
        self.assertEqual(list(od.items()), pairs)  # pairs input
        od = OrderedDict()
        od.update([('a', 1), ('b', 2), ('c', 9), ('d', 4)], c=3, e=5)
        self.assertEqual(list(od.items()), pairs)  # mixed input

        # Issue 9137: Named argument called 'other' or 'self'
        # shouldn't be treated specially.
        od = OrderedDict()
        od.update(self=23)
        self.assertEqual(list(od.items()), [('self', 23)])
        od = OrderedDict()
        od.update(other={})
        self.assertEqual(list(od.items()), [('other', {})])
        od = OrderedDict()
        od.update(red=5, blue=6, other=7, self=8)
        self.assertEqual(sorted(list(od.items())), [('blue', 6), ('other', 7),
                                                    ('red', 5), ('self', 8)])

        # Make sure that direct calls to update do not clear previous contents
        # and that updated items are not moved to the end
        d = OrderedDict([('a', 1), ('b', 2), ('c', 3), ('d', 44), ('e', 55)])
        d.update([('e', 5), ('f', 6)], g=7, d=4)
        self.assertEqual(list(d.items()), [('a', 1), ('b', 2), ('c', 3),
                                           ('d', 4), ('e', 5), ('f', 6),
                                           ('g', 7)])

        self.assertRaises(TypeError, OrderedDict().update, 42)
        self.assertRaises(TypeError, OrderedDict().update, (), ())
        self.assertRaises(TypeError, OrderedDict.update)
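One property worth noting from this test, sketched in isolation: update() on an existing key changes the value but keeps the key's position, while new keys go to the end:

    from collections import OrderedDict

    od = OrderedDict([('a', 1), ('b', 2)])
    od.update(a=10, c=3)
    print(list(od.items()))  # [('a', 10), ('b', 2), ('c', 3)] -- 'a' keeps its slot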
Example #10
class NegraCorpusReader(CorpusReader):
	"""Read a corpus in the Negra export format."""

	def blocks(self):
		if self._block_cache is None:
			self._block_cache = OrderedDict(self._read_blocks())
		return OrderedDict((a, '\n'.join(b) + '\n')
				for a, b in self._block_cache.items())

	def _read_blocks(self):
		"""Read corpus and yield blocks corresponding to each sentence."""
		results = set()
		started = False
		for filename in self._filenames:
			for line in openread(filename, encoding=self._encoding):
				if line.startswith('#BOS '):
					if started:
						raise ValueError('beginning of sentence marker while '
								'previous one still open: %s' % line)
					started = True
					line = line.strip()
					sentid = line.split()[1]
					lines = [line]
				elif line.startswith('#EOS '):
					if not started:
						raise ValueError('end of sentence marker while '
								'none started')
					thissentid = line.strip().split()[1]
					if sentid != thissentid:
						raise ValueError('unexpected sentence id: '
							'start=%s, end=%s' % (sentid, thissentid))
					started = False
					if sentid in results:
						raise ValueError('duplicate sentence ID: %s' % sentid)
					results.add(sentid)
					lines.append(line.strip())
					yield sentid, lines
				elif started:
					lines.append(line.strip())
				# other lines are ignored, such as #FORMAT x, %% comments, ...

	def _parse(self, block):
		return exporttree(block, self.functions, self.morphology, self.lemmas)
Example #11
def read_config(config_file, default_config=None):
    """
    This function is from tonic (author: Joe Hamman)
    Return a dictionary with subdictionaries of all configFile options/values
    """

    try:
        from cyordereddict import OrderedDict
    except ImportError:
        from collections import OrderedDict
    try:
        from configparser import SafeConfigParser  # Python 3
    except ImportError:
        from ConfigParser import SafeConfigParser  # Python 2

    config = SafeConfigParser()
    config.optionxform = str
    config.read(config_file)
    sections = config.sections()
    dict1 = OrderedDict()
    for section in sections:
        options = config.options(section)
        dict2 = OrderedDict()
        for option in options:
            dict2[option] = config_type(config.get(section, option))
        dict1[section] = dict2

    if default_config is not None:
        for name, section in dict1.items():
            if name in default_config.keys():
                for option, key in default_config[name].items():
                    if option not in section.keys():
                        dict1[name][option] = key

    return dict1
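The config_type helper is not part of the snippet; a hypothetical stand-in and usage sketch (file name and defaults invented for illustration):

    def config_type(value):
        """Hypothetical coercion helper: try int, then float, else return the string."""
        for cast in (int, float):
            try:
                return cast(value)
            except ValueError:
                pass
        return value

    # cfg = read_config('model.cfg', default_config={'Domain': {'lon': -120.0}})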
Example #12
 def test_copying(self):
     # Check that ordered dicts are copyable, deepcopyable, picklable,
     # and have a repr/eval round-trip
     pairs = [('c', 1), ('b', 2), ('a', 3), ('d', 4), ('e', 5), ('f', 6)]
     od = OrderedDict(pairs)
     update_test = OrderedDict()
     update_test.update(od)
     for i, dup in enumerate([
                 od.copy(),
                 copy.copy(od),
                 copy.deepcopy(od),
                 pickle.loads(pickle.dumps(od, 0)),
                 pickle.loads(pickle.dumps(od, 1)),
                 pickle.loads(pickle.dumps(od, 2)),
                 pickle.loads(pickle.dumps(od, -1)),
                 eval(repr(od)),
                 update_test,
                 OrderedDict(od),
                 ]):
         self.assertTrue(dup is not od)
         self.assertEqual(dup, od)
         self.assertEqual(list(dup.items()), list(od.items()))
         self.assertEqual(len(dup), len(od))
         self.assertEqual(type(dup), type(od))
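The pickle leg of this round trip, sketched standalone; insertion order survives serialization, and equality between two OrderedDicts is order-sensitive:

    import pickle
    from collections import OrderedDict

    od = OrderedDict([('b', 2), ('a', 1)])
    dup = pickle.loads(pickle.dumps(od, pickle.HIGHEST_PROTOCOL))
    print(dup == od)                              # True
    print(list(dup.items()) == list(od.items()))  # True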
Example #13
class FormVersion(object):
    @classmethod
    def verify_schema_structure(cls, struct):
        if 'content' not in struct:
            raise SchemaError('version content must have "content"')
        if 'survey' not in struct['content']:
            raise SchemaError('version content must have "survey"')
        validate_content(struct['content'])

    # QUESTION FOR ALEX: get rid of _root_node_name? What is it for?
    def __init__(self, form_pack, schema):

        # QUESTION FOR ALEX: why this check?
        if 'name' in schema:
            raise ValueError('FormVersion should not have a name parameter. '
                             'Consider using "title" or "id_string".')
        self.schema = schema
        self.form_pack = form_pack

        # slug of title
        self.root_node_name = self._get_root_node_name()

        # form version id, unique to this version of the form
        self.id = schema.get('version')
        self.version_id_key = schema.get('version_id_key',
                                         form_pack.default_version_id_key)

        # form string id, unique to this form, shared across versions
        self.id_string = schema.get('id_string')

        # TODO: set the title of the last version as the name of the first
        # section ?
        # Human readable title for this version
        self.title = schema.get('title', form_pack.title)

        # List of available languages for translation. Having a translation
        # does not mean all labels are translated, but at least one is.
        # One special translation not listed here is "_default", which
        # uses either the only label available, or the field name.
        # This will be converted down the line to a list. We use an OrderedDict
        # to maintain order and remove duplicates, but will need indexing later.
        self.translations = OrderedDict()

        # Sections separate fields from various levels of nesting in case
        # we have repeat groups. If you don't have repeat groups, you have
        # only one section; if you do, you will have one
        # section per repeat group. Sections eventually become sheets in
        # the xls export.
        self.sections = OrderedDict()

        content = self.schema['content']

        self.translations = map(lambda t: t if t is not None else UNTRANSLATED,
                                content.get('translations', [None]))

        # TODO: put those parts in a separate method and unit test it
        survey = content.get('survey', [])
        fields_by_name = dict(map(lambda row: (row.get('name'), row), survey))

        # Analyze the survey schema and extract the information we need
        # to build the export: the sections, the choices, the fields
        # and translations for each of them.

        # Extract choices data.
        # Choices are the list of values you can choose from to answer a
        # specific question. They can have translatable labels.
        choices_definition = content.get('choices', ())
        field_choices = FormChoice.all_from_json_definition(choices_definition,
                                                            self.translations)

        # Extract fields data
        group = None
        section = FormSection(name=form_pack.title)
        self.sections[form_pack.title] = section

        # These keep track of where we are while traversing the
        # schema.
        # hierarchy contains all the levels, mixing groups and sections,
        # including the first and last ones, while the stacks are just a
        # history of previous levels, one for groups and one for sections.
        hierarchy = [section]
        group_stack = []
        section_stack = []

        for data_definition in survey:
            data_type = data_definition.get('type')
            if not data_type: # handle broken data type definition
                continue

            data_type = normalize_data_type(data_type)
            name = data_definition.get('name')

            # parse closing groups and repeat
            if data_type is None:
                continue

            if data_type == 'end_group':
                # We go up one level of nesting, so we set the current group
                # to be what used to be the parent group. We also remove one
                # level from the hierarchy.
                hierarchy.pop()
                group = group_stack.pop()
                continue

            if data_type == 'end_repeat':
                # We go up one level of nesting, so we set the current section
                # to be what used to be the parent section.
                hierarchy.pop()
                section = section_stack.pop()
                continue

            # parse definitions of named entries, such as fields
            # or opening groups and repeats
            if name is None:
                continue

            if data_type == 'begin_group':
                group_stack.append(group)
                group = FormGroup.from_json_definition(data_definition)
                # We go down one level of nesting, so save the parent group.
                # Parent may be None; in that case we are at the top level.
                hierarchy.append(group)
                continue

            if data_type == 'begin_repeat':
                # We go down one level of nesting, so save the parent section.
                # Parent may be None; in that case we are at the top level.
                parent_section = section

                section = FormSection.from_json_definition(data_definition,
                                                           hierarchy,
                                                           parent=parent_section)
                self.sections[section.name] = section
                hierarchy.append(section)
                section_stack.append(parent_section)
                parent_section.children.append(section)
                continue

            # If we are here, it's a regular field
            # get the data name and type
            field = FormField.from_json_definition(data_definition,
                                                   hierarchy, section,
                                                   field_choices,
                                                   translations=self.translations)
            section.fields[field.name] = field

            _f = fields_by_name[field.name]
            _labels = LabelStruct()

            if 'label' in _f:
                if not isinstance(_f['label'], list):
                    _f['label'] = [_f['label']]
                _labels = LabelStruct(labels=_f['label'],
                                      translations=self.translations)

            field.labels = _labels
            assert 'labels' not in _f

    def __repr__(self):
        return '<FormVersion %s>' % self._stats()

    def _stats(self):
        _stats = OrderedDict()
        _stats['id_string'] = self._get_id_string()
        _stats['version'] = self.id
        _stats['row_count'] = len(self.schema.get('content', {}).get('survey', []))
        # returns stats in the format [ key="value" ]
        return '\n\t'.join(map(lambda key: '%s="%s"' % (key, str(_stats[key])),
                               _stats.keys()))

    def to_dict(self, **opts):
        return flatten_content(self.schema['content'], **opts)

    # TODO: find where to move that
    def _load_submission_xml(self, xml):
        raise NotImplementedError("This doesn't work now that submissions "
                                  "are out of the class. Port it to Export.")
        _xmljson = parse_xml_to_xmljson(xml)
        _rootatts = _xmljson.get('attributes', {})
        _id_string = _rootatts.get('id_string')
        _version_id = _rootatts.get('version')
        if _id_string != self._get_id_string():
            raise ValueError('submission id_string does not match: %s != %s' %
                             (self._get_id_string(), _id_string))
        if _version_id != self.form_pack.id_string:
            raise ValueError('mismatching version id %s != %s' %
                             (self.form_pack.id_string, _version_id))
        self.submissions.append(FormSubmission.from_xml(_xmljson, self))

    def lookup(self, prop, default=None):
        result = getattr(self, prop, None)
        if result is None:
            result = self.form_pack.lookup(prop, default=default)
        return result

    def _get_root_node_name(self):
        return self.lookup('root_node_name', default='data')

    def _get_id_string(self):
        return self.lookup('id_string')

    def _get_title(self):
        '''
        If the form version has no title, use the form pack's title.
        '''
        if self.title is None:
            return self.form_pack.title
        return self.title

    def get_labels(self, lang=UNTRANSLATED, group_sep=None):
        """ Returns a mapping of labels for {section: [field_label, ...]...}

            Sections and fields labels can be set to use their slug name,
            their lone label, or one of the translated labels.

            If a field is part of a group and a group separator is passed,
            the group label is retrieved, possibly translated, and
            prepended to the field label itself.
        """

        all_labels = OrderedDict()
        for section_name, section in self.sections.items():

            section_label = section.labels.get(lang) or section_name
            section_labels = all_labels[section_label] = []

            for field_name, field in section.fields.items():
                section_labels.extend(field.get_labels(lang, group_sep))

        return all_labels

    def to_xml(self, warnings=None):
        # TODO: collect warnings from pyxform compilation when a list is passed
        survey = formversion_pyxform(
            self.to_dict(remove_sheets=['translations', 'translated']))
        title = self._get_title()

        if title is None:
            raise ValueError('cannot create xml on a survey with no title.')

        survey.update({
            'name': self.lookup('root_node_name', 'data'),
            'id_string': self.lookup('id_string'),
            'title': self.lookup('title'),
            'version': self.lookup('id'),
        })
        return survey._to_pretty_xml().encode('utf-8')
Example #14
class AttrTree(object):
    """
    An AttrTree offers convenient, multi-level attribute access for
    collections of objects. AttrTree objects may also be combined
    together using the update method or merge classmethod. Here is an
    example of adding a ViewableElement to an AttrTree and accessing it:

    >>> t = AttrTree()
    >>> t.Example.Path = 1
    >>> t.Example.Path                             #doctest: +ELLIPSIS
    1
    """
    _disabled_prefixes = []  # Attribute prefixes for which name sanitization is disabled
    _sanitizer = util.sanitize_identifier

    @classmethod
    def merge(cls, trees):
        """
        Merge a collection of AttrTree objects.
        """
        first = trees[0]
        for tree in trees:
            first.update(tree)
        return first

    def __dir__(self):
        """
        The _dir_mode may be set to 'default' or 'user'; in 'user' mode
        only the child nodes added by the user are listed.
        """
        dict_keys = self.__dict__.keys()
        if self.__dict__['_dir_mode'] == 'user':
            return self.__dict__['children']
        else:
            return dir(type(self)) + list(dict_keys)

    def __init__(self,
                 items=None,
                 identifier=None,
                 parent=None,
                 dir_mode='default'):
        """
        identifier: A string identifier for the current node (if any)
        parent:     The parent node (if any)
        items:      Items as (path, value) pairs to construct
                    (sub)tree down to given leaf values.

        Note that the root node does not have a parent and does not
        require an identifier.
        """
        self.__dict__['parent'] = parent
        self.__dict__['identifier'] = type(self)._sanitizer(identifier,
                                                            escape=False)
        self.__dict__['children'] = []
        self.__dict__['_fixed'] = False
        self.__dict__['_dir_mode'] = dir_mode  # Either 'default' or 'user'

        fixed_error = 'No attribute %r in this AttrTree, and none can be added because fixed=True'
        self.__dict__['_fixed_error'] = fixed_error
        self.__dict__['data'] = OrderedDict()
        self.__dict__['data'] = OrderedDict()
        items = items.items() if isinstance(items, OrderedDict) else items
        # Under Python 3, items() returns a view, so materialize it as a list
        items = list(items) if items else items
        items = [] if not items else items
        for path, item in items:
            self.set_path(path, item)

    @property
    def path(self):
        "Returns the path up to the root for the current node."
        if self.parent:
            return '.'.join([self.parent.path, str(self.identifier)])
        else:
            return self.identifier if self.identifier else self.__class__.__name__

    @property
    def fixed(self):
        "If fixed, no new paths can be created via attribute access"
        return self.__dict__['_fixed']

    @fixed.setter
    def fixed(self, val):
        self.__dict__['_fixed'] = val

    def update(self, other):
        """
        Update the contents of the current AttrTree with the
        contents of a second AttrTree.
        """
        if not isinstance(other, AttrTree):
            raise Exception('Can only update with another AttrTree type.')
        fixed_status = (self.fixed, other.fixed)
        (self.fixed, other.fixed) = (False, False)
        for identifier, element in other.items():
            if identifier not in self.data:
                self[identifier] = element
            else:
                self[identifier].update(element)
        (self.fixed, other.fixed) = fixed_status

    def set_path(self, path, val):
        """
        Set the given value at the supplied path where path is either
        a tuple of strings or a string in A.B.C format.
        """
        path = tuple(path.split('.')) if isinstance(path, str) else tuple(path)

        disallowed = [
            p for p in path if not type(self)._sanitizer.allowable(p)
        ]
        if any(disallowed):
            raise Exception("Attribute strings in path elements cannot be "
                            "correctly escaped : %s" %
                            ','.join(repr(el) for el in disallowed))
        if len(path) > 1:
            attrtree = self.__getattr__(path[0])
            attrtree.set_path(path[1:], val)
        else:
            self.__setattr__(path[0], val)

    def filter(self, path_filters):
        """
        Filters the loaded AttrTree using the supplied path_filters.
        """
        if not path_filters: return self

        # Convert string path filters
        path_filters = [
            tuple(pf.split('.')) if not isinstance(pf, tuple) else pf
            for pf in path_filters
        ]

        # Search for substring matches between paths and path filters
        new_attrtree = self.__class__()
        for path, item in self.data.items():
            if any([
                    all([subpath in path for subpath in pf])
                    for pf in path_filters
            ]):
                new_attrtree.set_path(path, item)

        return new_attrtree

    def _propagate(self, path, val):
        """
        Propagate the value up to the root node.
        """
        if val == '_DELETE':
            if path in self.data:
                del self.data[path]
            else:
                items = [(key, v) for key, v in self.data.items()
                         if not all(k == p for k, p in zip(key, path))]
                self.data = OrderedDict(items)
        else:
            self.data[path] = val
        if self.parent is not None:
            self.parent._propagate((self.identifier, ) + path, val)

    def __setitem__(self, identifier, val):
        """
        Set a value at a child node with given identifier. If at a root
        node, multi-level path specifications is allowed (i.e. 'A.B.C'
        format or tuple format) in which case the behaviour matches
        that of set_path.
        """
        if isinstance(identifier, str) and '.' not in identifier:
            self.__setattr__(identifier, val)
        elif isinstance(identifier, str) and self.parent is None:
            self.set_path(tuple(identifier.split('.')), val)
        elif isinstance(identifier, tuple) and self.parent is None:
            self.set_path(identifier, val)
        else:
            raise Exception(
                "Multi-level item setting only allowed from root node.")

    def __getitem__(self, identifier):
        """
        For a given non-root node, access a child element by identifier.

        If the node is a root node, you may also access elements using
        either tuple format or the 'A.B.C' string format.
        """
        split_label = (tuple(identifier.split('.')) if isinstance(
            identifier, str) else tuple(identifier))
        if len(split_label) == 1:
            identifier = split_label[0]
            if identifier in self.children:
                return self.__dict__[identifier]
            else:
                raise KeyError(identifier)
        path_item = self
        for identifier in split_label:
            path_item = path_item[identifier]
        return path_item

    def __delitem__(self, identifier):
        split_label = (tuple(identifier.split('.')) if isinstance(
            identifier, str) else tuple(identifier))
        if len(split_label) == 1:
            identifier = split_label[0]
            if identifier in self.children:
                del self.__dict__[identifier]
                self.children.pop(self.children.index(identifier))
            else:
                raise KeyError(identifier)
            self._propagate(split_label, '_DELETE')
        else:
            path_item = self
            for i, identifier in enumerate(split_label[:-1]):
                path_item = path_item[identifier]
            del path_item[split_label[-1]]

    def __setattr__(self, identifier, val):
        # Getattr is skipped for root and first set of children
        shallow = (self.parent is None or self.parent.parent is None)

        if util.tree_attribute(identifier) and self.fixed and shallow:
            raise AttributeError(self._fixed_error % identifier)

        super(AttrTree, self).__setattr__(identifier, val)

        if util.tree_attribute(identifier):
            if identifier not in self.children:
                self.children.append(identifier)
            self._propagate((identifier, ), val)

    def __getattr__(self, identifier):
        """
        Access a identifier from the AttrTree or generate a new AttrTree
        with the chosen attribute path.
        """
        try:
            return super(AttrTree, self).__getattr__(identifier)
        except AttributeError:
            pass

        # Attributes starting with __ get name mangled
        if identifier.startswith(
                '_' + type(self).__name__) or identifier.startswith('__'):
            raise AttributeError('Attribute %s not found.' % identifier)
        elif self.fixed:
            raise AttributeError(self._fixed_error % identifier)

        if not any(
                identifier.startswith(prefix)
                for prefix in type(self)._disabled_prefixes):
            sanitized = type(self)._sanitizer(identifier, escape=False)
        else:
            sanitized = identifier

        if sanitized in self.children:
            return self.__dict__[sanitized]

        if not sanitized.startswith('_') and util.tree_attribute(identifier):
            self.children.append(sanitized)
            dir_mode = self.__dict__['_dir_mode']
            child_tree = self.__class__(identifier=sanitized,
                                        parent=self,
                                        dir_mode=dir_mode)
            self.__dict__[sanitized] = child_tree
            return child_tree
        else:
            raise AttributeError('%r object has no attribute %s.' %
                                 (type(self).__name__, identifier))

    def __iter__(self):
        return iter(self.data.values())

    def __contains__(self, name):
        return name in self.children or name in self.data

    def __len__(self):
        return len(self.data)

    def get(self, identifier, default=None):
        """Get a node of the AttrTree using its path string.

        Args:
            identifier: Path string of the node to return
            default: Value to return if no node is found

        Returns:
            The indexed node of the AttrTree
        """
        split_label = (tuple(identifier.split('.')) if isinstance(
            identifier, str) else tuple(identifier))
        if len(split_label) == 1:
            identifier = split_label[0]
            return self.__dict__.get(identifier, default)
        path_item = self
        for identifier in split_label:
            if path_item == default or path_item is None:
                return default
            path_item = path_item.get(identifier, default)
        return path_item

    def keys(self):
        "Keys of nodes in the AttrTree"
        return list(self.data.keys())

    def items(self):
        "Keys and nodes of the AttrTree"
        return list(self.data.items())

    def values(self):
        "Nodes of the AttrTree"
        return list(self.data.values())

    def pop(self, identifier, default=None):
        """Pop a node of the AttrTree using its path string.

        Args:
            identifier: Path string of the node to return
            default: Value to return if no node is found

        Returns:
            The node that was removed from the AttrTree
        """
        if identifier in self.children:
            item = self[identifier]
            self.__delitem__(identifier)
            return item
        else:
            return default

    def __repr__(self):
        return PrettyPrinter.pprint(self)
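Usage along the lines of the class doctest (a sketch; assumes the surrounding util and PrettyPrinter helpers this module imports):

    t = AttrTree()
    t.Example.Path = 1       # attribute access auto-creates intermediate nodes
    t.set_path('A.B', 2)     # equivalent string-path form
    print(t['A.B'], len(t))  # 2 2 -- two leaf values stored in t.data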
Example #15
 def test_setitem(self):
     od = OrderedDict([('d', 1), ('b', 2), ('c', 3), ('a', 4), ('e', 5)])
     od['c'] = 10  # existing element
     od['f'] = 20  # new element
     self.assertEqual(list(od.items()), [('d', 1), ('b', 2), ('c', 10),
                                         ('a', 4), ('e', 5), ('f', 20)])
Example #16
jieba.enable_parallel(4)

big_dict = OrderedDict()  # word -> occurrence count
for each_link in wechat_links(driver):
    print(each_link)
    article = get_article(each_link)
    if article is not None:
        for each_word_cut in word_cuts(article):
            if len(each_word_cut) > 1:  # skip single-character tokens
                if big_dict.get(each_word_cut) is None:
                    big_dict[each_word_cut] = 1
                else:
                    big_dict[each_word_cut] += 1

driver.quit()
# Collapse to a list of (word, count) pairs, most frequent first
big_dict = sorted(big_dict.items(), key=lambda d: d[1], reverse=True)

now = datetime.datetime.now()
today = now.strftime('%Y%m%d%H%M%S')
pfile = open("wechat_word_cut"+today+".pkl", "wb", buffering=1024)
pfile.write(dumps(big_dict))
pfile.close()
f = open("wechat_word_cut"+today+".csv", "wb", buffering=1024)
for each_word_cut, word_count in big_dict:
    line = each_word_cut + "," + str(word_count) + chr(10)
    f.write(line.encode('utf-8'))
f.close()
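For the CSV step, the standard csv module avoids assembling lines by hand with chr(10) (a sketch over the same data):

    import csv

    with open('wechat_word_cut' + today + '.csv', 'w', newline='', encoding='utf-8') as fh:
        csv.writer(fh).writerows(big_dict)  # big_dict is a list of (word, count) pairs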
Example #17
class TigerXMLCorpusReader(CorpusReader):
	"""Corpus reader for the Tiger XML format."""

	def blocks(self):
		"""
		:returns: a list of strings containing the raw representation of
			trees in the treebank."""
		if self._block_cache is None:
			self._block_cache = OrderedDict(self._read_blocks())
		return OrderedDict((n, ElementTree.tostring(a))
				for n, a in self._block_cache.items())

	def _read_blocks(self):
		for filename in self._filenames:
			# iterator over elements in the XML file
			context = ElementTree.iterparse(filename,
					events=('start', 'end'))
			_, root = next(context)  # event == 'start' of root element
			for event, elem in context:
				if event == 'end' and elem.tag == 's':
					yield elem.get('id'), elem
				root.clear()

	def _parse(self, block):
		"""Translate Tiger XML structure to the fields of export format."""
		nodes = OrderedDict()
		root = block.find('graph').get('root')
		for term in block.find('graph').find('terminals'):
			fields = nodes.setdefault(term.get('id'), 6 * [None])
			fields[WORD] = term.get('word')
			fields[LEMMA] = term.get('lemma')
			fields[TAG] = term.get('pos')
			fields[MORPH] = term.get('morph')
			fields[PARENT] = '0' if term.get('id') == root else None
			nodes[term.get('id')] = fields
		for nt in block.find('graph').find('nonterminals'):
			if nt.get('id') == root:
				ntid = '0'
			else:
				fields = nodes.setdefault(nt.get('id'), 6 * [None])
				ntid = nt.get('id').split('_')[-1]
				fields[WORD] = '#' + ntid
				fields[TAG] = nt.get('cat')
				fields[LEMMA] = fields[MORPH] = '--'
			for edge in nt:
				idref = edge.get('idref')
				nodes.setdefault(idref, 6 * [None])
				if edge.tag == 'edge':
					if nodes[idref][FUNC] is not None:
						raise ValueError('%s already has a parent: %r'
								% (idref, nodes[idref]))
					nodes[idref][FUNC] = edge.get('label')
					nodes[idref][PARENT] = ntid
				elif edge.tag == 'secedge':
					nodes[idref].extend((edge.get('label'), ntid))
				else:
					raise ValueError("expected 'edge' or 'secedge' tag.")
		for idref in nodes:
			if nodes[idref][PARENT] is None:
				raise ValueError('%s does not have a parent: %r' % (
						idref, nodes[idref]))
		item = exporttree(
				['\t'.join(a) for a in nodes.values()],
				self.functions, self.morphology, self.lemmas)
		item.tree.label = nodes[root][TAG]
		item.block = ElementTree.tostring(block)
		return item
Example #18
    def recursive_processing(self, base_dir, target_dir, it):
        """Method to recursivly process the notebooks in the `base_dir`

        Parameters
        ----------
        base_dir: str
            Path to the base example directory (see the `examples_dir`
            parameter for the :class:`Gallery` class)
        target_dir: str
            Path to the output directory for the rst files (see the
            `gallery_dirs` parameter for the :class:`Gallery` class)
        it: iterable
            The iterator over the subdirectories and files in `base_dir`
            generated by the :func:`os.walk` function"""
        try:
            file_dir, dirs, files = next(it)
        except StopIteration:
            return '', []
        readme_files = {'README.md', 'README.rst', 'README.txt'}
        if readme_files.intersection(files):
            foutdir = file_dir.replace(base_dir, target_dir)
            create_dirs(foutdir)
            this_nbps = [
                NotebookProcessor(
                    infile=f,
                    outfile=os.path.join(foutdir, os.path.basename(f)),
                    disable_warnings=self.disable_warnings,
                    preprocess=((self.preprocess is True
                                 or f in self.preprocess)
                                and not (self.dont_preprocess is True
                                         or f in self.dont_preprocess)),
                    clear=((self.clear is True or f in self.clear)
                           and not (self.dont_clear is True
                                    or f in self.dont_clear)),
                    code_example=self.code_examples.get(f),
                    supplementary_files=self.supplementary_files.get(f),
                    other_supplementary_files=self.osf.get(f),
                    thumbnail_figure=self.thumbnail_figures.get(f),
                    url=self.get_url(f.replace(base_dir, '')),
                    binder_url=self.get_binder_url(f.replace(base_dir, '')),
                    **self._nbp_kws)
                for f in map(lambda f: os.path.join(file_dir, f),
                             filter(self.pattern.match, files))
            ]
            readme_file = next(iter(readme_files.intersection(files)))
        else:
            return '', []
        labels = OrderedDict()
        this_label = 'gallery_' + foutdir.replace(os.path.sep, '_')
        if this_label.endswith('_'):
            this_label = this_label[:-1]
        for d in dirs:
            label, nbps = self.recursive_processing(base_dir, target_dir, it)
            if label:
                labels[label] = nbps
        s = ".. _%s:\n\n" % this_label

        if readme_file.endswith('.md'):
            s += spr.check_output([
                'pandoc',
                os.path.join(file_dir, readme_file), '-t', 'rst'
            ]).decode('utf-8').rstrip() + '\n\n'
        else:
            with open(os.path.join(file_dir, readme_file)) as f:
                s += f.read().rstrip() + '\n\n'

        if self.toctree_depth:
            s += "\n\n.. toctree::"
            if self.toctree_depth > 0:
                s += "\n    :maxdepth: %d" % self.toctree_depth
            s += "\n\n"
            s += ''.join(
                '    %s\n' %
                os.path.splitext(os.path.basename(nbp.get_out_file()))[0]
                for nbp in this_nbps)
            for d in dirs:
                findex = os.path.join(d, 'index.rst')
                if os.path.exists(os.path.join(foutdir, findex)):
                    s += '    %s\n' % os.path.splitext(findex)[0]

            s += '\n'

        for nbp in this_nbps:
            code_div = nbp.code_div
            if code_div is not None:
                s += code_div + '\n'
            else:
                s += nbp.thumbnail_div + '\n'
        s += "\n.. raw:: html\n\n    <div style='clear:both'></div>\n"
        for label, nbps in labels.items():
            s += '\n.. only:: html\n\n    .. rubric:: :ref:`%s`\n\n' % (label)
            for nbp in nbps:
                code_div = nbp.code_div
                if code_div is not None:
                    s += code_div + '\n'
                else:
                    s += nbp.thumbnail_div + '\n'
            s += "\n.. raw:: html\n\n    <div style='clear:both'></div>\n"

        s += '\n'

        with open(os.path.join(foutdir, 'index.rst'), 'w') as f:
            f.write(s)
        return this_label, list(chain(this_nbps, *labels.values()))
Example #19
class MultiDimensionalMapping(Dimensioned):
    """
    A MultiDimensionalMapping is a Dimensioned mapping (like a
    dictionary or array) that uses fixed-length multidimensional
    keys. This behaves like a sparse N-dimensional array that does not
    require a dense sampling over the multidimensional space.

    If the underlying value for each (key,value) pair also supports
    indexing (such as a dictionary, array, or list), fully qualified
    (deep) indexing may be used from the top level, with the first N
    dimensions of the index selecting a particular Dimensioned object
    and the remaining dimensions indexing into that object.

    For instance, for a MultiDimensionalMapping with dimensions "Year"
    and "Month" and underlying values that are 2D floating-point
    arrays indexed by (r,c), a 2D array may be indexed with x[2000,3]
    and a single floating-point number may be indexed as
    x[2000,3,1,9].

    In practice, this class is typically only used as an abstract base
    class, because the NdMapping subclass extends it with a range of
    useful slicing methods for selecting subsets of the data. Even so,
    keeping the slicing support separate from the indexing and data
    storage methods helps make both classes easier to understand.
    """

    group = param.String(default='MultiDimensionalMapping')

    key_dimensions = param.List(default=[Dimension("Default")], constant=True)

    data_type = None  # Optional type checking of elements
    _deep_indexable = False
    _sorted = True

    def __init__(self, initial_items=None, **params):
        if isinstance(initial_items, NdMapping):
            map_type = type(initial_items)
            own_params = self.params()
            new_params = dict(initial_items.get_param_values(onlychanged=True))
            if new_params.get('group') == map_type.__name__:
                new_params.pop('group')
            params = dict(
                {
                    name: value
                    for name, value in new_params.items() if name in own_params
                }, **params)
        super(MultiDimensionalMapping, self).__init__(OrderedDict(), **params)

        self._next_ind = 0
        self._check_key_type = True
        self._cached_index_types = [d.type for d in self.key_dimensions]
        self._cached_index_values = {
            d.name: d.values
            for d in self.key_dimensions
        }
        self._cached_categorical = any(d.values for d in self.key_dimensions)

        if isinstance(initial_items, tuple):
            self._add_item(initial_items[0], initial_items[1])
        elif initial_items is not None:
            self.update(OrderedDict(initial_items))

    def _item_check(self, dim_vals, data):
        """
        Applies optional checks to individual data elements before
        they are inserted, ensuring that they are of a certain
        type. Subclasses may implement further element restrictions.
        """
        if self.data_type is not None and not isinstance(data, self.data_type):
            if isinstance(self.data_type, tuple):
                data_type = tuple(dt.__name__ for dt in self.data_type)
            else:
                data_type = self.data_type.__name__
            raise TypeError(
                '{slf} does not accept {data} type, data elements have '
                'to be a {restr}.'.format(slf=type(self).__name__,
                                          data=type(data).__name__,
                                          restr=data_type))
        elif len(dim_vals) != self.ndims:
            raise KeyError('Key has to match number of dimensions.')

    def _add_item(self, dim_vals, data, sort=True):
        """
        Adds item to the data, applying dimension types and ensuring
        key conforms to Dimension type and values.
        """
        if not isinstance(dim_vals, tuple):
            dim_vals = (dim_vals, )

        self._item_check(dim_vals, data)

        # Apply dimension types
        dim_types = zip(self._cached_index_types, dim_vals)
        dim_vals = tuple(v if t is None else t(v) for t, v in dim_types)

        # Check and validate for categorical dimensions
        if self._cached_categorical:
            valid_vals = zip(self._cached_index_names, dim_vals)
        else:
            valid_vals = []
        for dim, val in valid_vals:
            vals = self._cached_index_values[dim]
            if vals and val not in vals:
                raise KeyError('%s Dimension value %s not in'
                               ' specified Dimension values.' %
                               (dim, repr(val)))

        # Updates nested data structures rather than simply overriding them.
        if ((dim_vals in self.data) and isinstance(self.data[dim_vals],
                                                   (NdMapping, OrderedDict))):
            self.data[dim_vals].update(data)
        else:
            self.data[dim_vals] = data

        if sort:
            self._resort()

    def _apply_key_type(self, keys):
        """
        If a type is specified by the corresponding key dimension,
        this method applies the type to the supplied key.
        """
        typed_key = ()
        for dim, key in zip(self.key_dimensions, keys):
            key_type = dim.type
            if key_type is None:
                typed_key += (key, )
            elif isinstance(key, slice):
                sl_vals = [key.start, key.stop, key.step]
                typed_key += (slice(*[
                    key_type(el) if el is not None else None for el in sl_vals
                ]), )
            elif key is Ellipsis:
                typed_key += (key, )
            elif isinstance(key, list):
                typed_key += ([key_type(k) for k in key], )
            else:
                typed_key += (key_type(key), )
        return typed_key

    def _split_index(self, key):
        """
        Partitions key into key and deep dimension groups. If only key
        indices are supplied, the data is indexed with an empty tuple.
        """
        if not isinstance(key, tuple):
            key = (key, )
        map_slice = key[:self.ndims]
        if self._check_key_type:
            map_slice = self._apply_key_type(map_slice)
        if len(key) == self.ndims:
            return map_slice, ()
        else:
            return map_slice, key[self.ndims:]

    def _dataslice(self, data, indices):
        """
        Returns slice of data element if the item is deep
        indexable. Warns if attempting to slice an object that has not
        been declared deep indexable.
        """
        if isinstance(data, Dimensioned):
            return data[indices]
        elif len(indices) > 0:
            self.warning('Cannot index into data element, extra data'
                         ' indices ignored.')
        return data

    def _resort(self):
        """
        Sorts data by key using usual Python tuple sorting semantics
        or sorts in categorical order for any categorical Dimensions.
        """
        sortkws = {}
        dimensions = self.key_dimensions
        if self._cached_categorical:
            sortkws['key'] = lambda x: tuple(
                dimensions[i].values.index(x[0][i])
                if dimensions[i].values else x[0][i]
                for i in range(self.ndims))
        self.data = OrderedDict(sorted(self.data.items(), **sortkws))

    def groupby(self,
                dimensions,
                container_type=None,
                group_type=None,
                **kwargs):
        """
        Splits the mapping into groups along the specified key
        dimensions; the groups are returned together in a mapping of
        class container_type. The individual groups are of the same
        type as the original map.
        """
        if self.ndims == 1:
            self.warning('Cannot split Map with only one dimension.')
            return self

        container_type = container_type if container_type else type(self)
        group_type = group_type if group_type else type(self)
        dims, inds = zip(*((self.get_dimension(dim),
                            self.get_dimension_index(dim))
                           for dim in dimensions))
        inames, idims = zip(*((dim.name, dim) for dim in self.key_dimensions
                              if dim.name not in dimensions))
        selects = unique_iterator(
            itemgetter(*inds)(key) if len(inds) > 1 else (key[inds[0]], )
            for key in self.data.keys())
        groups = [
            (sel,
             group_type(
                 self.select(**dict(zip(dimensions, sel))).reindex(inames),
                 **kwargs)) for sel in selects
        ]
        return container_type(groups, key_dimensions=dims)

    def add_dimension(self, dimension, dim_pos, dim_val, **kwargs):
        """
        Create a new object with an additional key dimension along
        which items are indexed. Requires the dimension name, the
        desired position in the key_dimensions and a key value that
        will be used across the dimension. This is particularly useful
        for merging several mappings together.
        """
        if isinstance(dimension, str):
            dimension = Dimension(dimension)

        if dimension.name in self._cached_index_names:
            raise Exception(
                '{dim} dimension already defined'.format(dim=dimension.name))

        dimensions = self.key_dimensions[:]
        dimensions.insert(dim_pos, dimension)

        items = OrderedDict()
        for key, val in self.data.items():
            new_key = list(key)
            new_key.insert(dim_pos, dim_val)
            items[tuple(new_key)] = val

        return self.clone(items, key_dimensions=dimensions, **kwargs)

    def drop_dimension(self, dim):
        """
        Returns a new mapping with the named dimension
        removed. Ensures that the dropped dimension is constant (owns
        only a single key value) before dropping it.
        """
        dim_labels = [d for d in self._cached_index_names if d != dim]
        return self.reindex(dim_labels)

    def dimension_values(self, dimension):
        "Returns the values along the specified dimension."
        all_dims = [d.name for d in self.dimensions()]
        if isinstance(dimension, int):
            dimension = all_dims[dimension]

        if dimension in self._cached_index_names:
            values = [
                k[self.get_dimension_index(dimension)]
                for k in self.data.keys()
            ]
        elif dimension in all_dims:
            values = [
                el.dimension_values(dimension) for el in self
                if dimension in el.dimensions()
            ]
            values = np.concatenate(values)
        else:
            raise Exception('Dimension %s not found.' % dimension)
        return values

    def reindex(self, dimension_labels=None, force=False):
        """
        Create a new object with a re-ordered or reduced set of key
        dimensions.

        Reducing the number of key dimensions will discard information
        from the keys. All data values are accessible in the newly
        created object as the new labels must be sufficient to address
        each value uniquely.
        """
        if not dimension_labels:
            dimension_labels = [
                d for d in self._cached_index_names
                if not len(set(self.dimension_values(d))) == 1
            ]

        indices = [self.get_dimension_index(el) for el in dimension_labels]

        keys = [tuple(k[i] for i in indices) for k in self.data.keys()]
        reindexed_items = OrderedDict(
            (k, v) for (k, v) in zip(keys, self.data.values()))
        reduced_dims = set(
            self._cached_index_names).difference(dimension_labels)
        dimensions = [
            self.get_dimension(d) for d in dimension_labels
            if d not in reduced_dims
        ]

        if len(set(keys)) != len(keys) and not force:
            raise Exception(
                "Given dimension labels not sufficient to address all values uniquely"
            )

        if len(keys):
            constant_dimensions = {
                self.get_dimension(d): self.dimension_values(d)[0]
                for d in reduced_dims
            }
        else:
            constant_dimensions = {}
        return self.clone(reindexed_items,
                          key_dimensions=dimensions,
                          constant_dimensions=constant_dimensions)

    @property
    def last(self):
        "Returns the item highest data item along the map dimensions."
        return list(self.data.values())[-1] if len(self) else None

    @property
    def last_key(self):
        "Returns the last key value."
        return list(self.keys())[-1] if len(self) else None

    @property
    def info(self):
        """
        Prints information about the Dimensioned object, including the
        number and type of objects contained within it and information
        about its dimensions.
        """
        info_str = self.__class__.__name__ +\
                   " containing %d items of type %s\n" % (len(self.keys()),
                                                          type(self.values()[0]).__name__)
        info_str += ('-' * (len(info_str) - 1)) + "\n\n"
        for group in self._dim_groups:
            dimensions = getattr(self, group)
            if dimensions:
                info_str += '%s Dimensions: \n' % group.capitalize()
            for d in dimensions:
                dmin, dmax = self.range(d.name)
                if d.formatter:
                    dmin, dmax = d.formatter(dmin), d.formatter(dmax)
                info_str += '\t %s: %s...%s \n' % (str(d), dmin, dmax)
        print(info_str)

    def table(self, **kwargs):
        "Creates a table from the stored keys and data."

        table = None
        for key, value in self.data.items():
            value = value.table(**kwargs)
            for idx, (dim, val) in enumerate(zip(self.key_dimensions, key)):
                value = value.add_dimension(dim, idx, val)
            if table is None:
                table = value
            else:
                table.update(value)
        return table

    def dframe(self):
        "Creates a pandas DataFrame from the stored keys and data."
        try:
            import pandas
        except ImportError:
            raise Exception(
                "Cannot build a DataFrame without the pandas library.")
        labels = self._cached_index_names + [self.group]
        return pandas.DataFrame(
            [dict(zip(labels, k + (v, ))) for (k, v) in self.data.items()])

    def update(self, other):
        """
        Updates the current mapping with some other mapping or
        OrderedDict instance, making sure that they are indexed along
        the same set of dimensions. The order of key_dimensions
        remains unchanged after the update.
        """
        if isinstance(other, NdMapping):
            if self.key_dimensions != other.key_dimensions:
                raise KeyError("Cannot update with NdMapping that has"
                               " a different set of key dimensions.")
        for key, data in other.items():
            self._add_item(key, data, sort=False)
        self._resort()

    def keys(self):
        " Returns the keys of all the elements."
        if self.ndims == 1:
            return [k[0] for k in self.data.keys()]
        else:
            return list(self.data.keys())

    def values(self):
        " Returns the values of all the elements."
        return list(self.data.values())

    def items(self):
        "Returns all elements as a list in (key,value) format."
        return list(zip(list(self.keys()), list(self.values())))

    def get(self, key, default=None):
        "Standard get semantics for all mapping types"
        try:
            if key is None:
                return None
            return self[key]
        except Exception:
            return default

    def pop(self, key, default=None):
        "Standard pop semantics for all mapping types"
        if not isinstance(key, tuple): key = (key, )
        return self.data.pop(key, default)

    def __getitem__(self, key):
        """
        Allows multi-dimensional indexing in the order of the
        specified key dimensions, passing any additional indices to
        the data elements.
        """
        if key in [Ellipsis, ()]:
            return self
        map_slice, data_slice = self._split_index(key)
        return self._dataslice(self.data[map_slice], data_slice)

    def __setitem__(self, key, value):
        self._add_item(key, value)

    def __str__(self):
        return repr(self)

    def __iter__(self):
        return iter(self.values())

    def __contains__(self, key):
        if self.ndims == 1:
            return key in self.data.keys()
        else:
            return key in self.keys()

    def __len__(self):
        return len(self.data)
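
A note on _resort above: keys are sorted either by plain tuple comparison or, when a Dimension declares explicit values, by each key component's position in that list. Below is a minimal standalone sketch of the categorical case, with a hypothetical month_order list standing in for Dimension.values:

from collections import OrderedDict

# Hypothetical categorical order for the second key dimension.
month_order = ['Jan', 'Feb', 'Mar']

data = OrderedDict([((2001, 'Mar'), 3.0),
                    ((2000, 'Feb'), 2.0),
                    ((2000, 'Jan'), 1.0)])

# Sort numerically on the first component and categorically on the
# second, mirroring the sortkws['key'] lambda in _resort.
resorted = OrderedDict(sorted(
    data.items(),
    key=lambda item: (item[0][0], month_order.index(item[0][1]))))

print(list(resorted))  # [(2000, 'Jan'), (2000, 'Feb'), (2001, 'Mar')]
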
Example #30
    def get_fields_labels_tags_for_all_versions(self,
                                                lang=UNSPECIFIED_TRANSLATION,
                                                group_sep="/",
                                                hierarchy_in_labels=False,
                                                multiple_select="both",
                                                tag_cols_for_header=None):
        """ Return 3 mappings containing field, labels, and tags by section

            This is needed because when making an export for several
            versions of the same form, fields get added, removed, and
            edited. Hence we pre-generate mappings containing labels,
            fields, and tags for all versions so we can use them later as a
            canvas to keep the export coherent.

            Labels are used as column headers.

            Fields are used to create rows of data from submissions.

            Tags specified by `tag_cols_for_header` are included as additional
            column headers (in CSV and XLSX exports only).
        """

        if tag_cols_for_header is None:
            tag_cols_for_header = []
        try:
            tag_cols_and_seps = {
                col: TAG_COLUMNS_AND_SEPARATORS[col]
                for col in tag_cols_for_header
            }
        except KeyError as e:
            raise RuntimeError(
                '{} is not in TAG_COLUMNS_AND_SEPARATORS'.format(e))

        section_fields = OrderedDict()  # {section: [(name, field), ...]}
        section_labels = OrderedDict()  # {section: [field_label, field_label]}
        section_tags = OrderedDict()  # {section: [{column_name: tag_string, ...}, ...]}

        all_fields = self.formpack.get_fields_for_versions(self.versions)
        all_sections = {}

        # List of fields we generate ourselves to add at the very end
        # of the field list
        auto_fields = OrderedDict()

        for field in all_fields:
            section_fields.setdefault(field.section.name, []).append(
                (field.name, field)
            )
            section_labels.setdefault(field.section.name, []).append(
                field.get_labels(lang, group_sep,
                                 hierarchy_in_labels,
                                 multiple_select)
            )
            all_sections[field.section.name] = field.section

        for section_name, section in all_sections.items():
            # Append optional additional fields
            auto_field_names = auto_fields[section_name] = []
            if section.children or self.force_index:
                auto_field_names.append('_index')

            if section.parent:
                auto_field_names.append('_parent_table_name')
                auto_field_names.append('_parent_index')
                # Add extra fields
                for copy_field in self.copy_fields:
                    if isclass(copy_field):
                        auto_field_names.append(
                            "_submission_{}".format(copy_field.FIELD_NAME))
                    else:
                        auto_field_names.append(
                            "_submission_{}".format(copy_field))


        # Flatten field labels and names. Indeed, field.get_labels()
        # and self.names return a list because a multiple select field can
        # have several values. We needed them grouped to insert them at the
        # proper index, but now we just want a flat list of all of them.

        # Flatten all the names for all the values of all the fields
        for section, fields in list(section_fields.items()):
            name_lists = []
            tags = []
            for _field_data in fields:
                if len(_field_data) != 2:
                    # e.g. [u'location', u'_location_latitude',...]
                    continue
                (field_name, field) = _field_data
                name_lists.append(field.value_names)

                # Add the tags for this field. If the field has multiple
                # labels, add the tags once for each label
                tags.extend(
                    [flatten_tag_list(field.tags, tag_cols_and_seps)] *
                        len(field.value_names)
                )

            names = [name for name_list in name_lists for name in name_list]

            # add auto fields:
            names.extend(auto_fields[section])
            tags.extend([{}] * len(auto_fields[section]))

            section_fields[section] = names
            section_tags[section] = tags

        # Flatten all the labels for all the headers of all the fields
        for section, labels in list(section_labels.items()):
            labels = [label for label_group in labels for label in label_group]

            # add auto fields (names and labels are the same)
            labels.extend(auto_fields[section])

            section_labels[section] = labels

        return section_fields, section_labels, section_tags
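
The flattening passes above turn grouped per-field lists into flat per-column lists, repeating each field's tag dict once per value column so headers and tags stay aligned. The same pattern with stand-in tuples (illustrative data, not formpack field objects):

# Stand-in fields: (name, value column names, tag dict).
fields = [('age', ['age'], {'hxl': '#age'}),
          ('location', ['_lat', '_lon'], {})]

names, tags = [], []
for _name, value_names, tag_dict in fields:
    names.extend(value_names)
    # One copy of the tags per value column, as done above.
    tags.extend([tag_dict] * len(value_names))

print(names)  # ['age', '_lat', '_lon']
print(tags)   # [{'hxl': '#age'}, {}, {}]
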
Example #31
class TigerXMLCorpusReader(CorpusReader):
    """Corpus reader for the Tiger XML format."""

    def blocks(self):
        """
        :returns: a list of strings containing the raw representation of
                trees in the treebank."""
        if self._block_cache is None:
            self._block_cache = OrderedDict(self._read_blocks())
        return OrderedDict((n, ElementTree.tostring(a))
                           for n, a in self._block_cache.items())

    def _read_blocks(self):
        for filename in self._filenames:
            # iterator over elements in the XML file
            context = ElementTree.iterparse(filename,
                                            events=('start', 'end'))
            _, root = next(context)  # event == 'start' of root element
            for event, elem in context:
                if event == 'end' and elem.tag == 's':
                    yield elem.get('id'), elem
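                # clear finished elements from the root so memory stays bounded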
                root.clear()

    def _parse(self, block):
        """Translate Tiger XML structure to the fields of export format."""
        nodes = OrderedDict()
        root = block.find('graph').get('root')
        for term in block.find('graph').find('terminals'):
            fields = nodes.setdefault(term.get('id'), 6 * [None])
            fields[WORD] = term.get('word')
            fields[LEMMA] = term.get('lemma')
            fields[TAG] = term.get('pos')
            fields[MORPH] = term.get('morph')
            fields[PARENT] = '0' if term.get('id') == root else None
            fields[FUNC] = '--'
            nodes[term.get('id')] = fields
        for nt in block.find('graph').find('nonterminals'):
            if nt.get('id') == root:
                ntid = '0'
            else:
                fields = nodes.setdefault(nt.get('id'), 6 * [None])
                ntid = nt.get('id').split('_')[-1]
                fields[WORD] = '#' + ntid
                fields[TAG] = nt.get('cat')
                fields[LEMMA] = fields[MORPH] = fields[FUNC] = '--'
            for edge in nt:
                idref = edge.get('idref')
                nodes.setdefault(idref, 6 * [None])
                if edge.tag == 'edge':
                    if nodes[idref][FUNC] not in (None, '--'):
                        raise ValueError('%s already has a parent: %r'
                                         % (idref, nodes[idref]))
                    nodes[idref][FUNC] = edge.get('label')
                    nodes[idref][PARENT] = ntid
                elif edge.tag == 'secedge':
                    nodes[idref].extend((edge.get('label'), ntid))
                else:
                    raise ValueError("expected 'edge' or 'secedge' tag.")
        for idref in nodes:
            if nodes[idref][PARENT] is None:
                raise ValueError('%s does not have a parent: %r' % (
                    idref, nodes[idref]))
        item = exporttree(
            ['#BOS ' + block.get('id')]
            + ['\t'.join(a) for a in nodes.values()]
            + ['#EOS ' + block.get('id')],
            self.functions, self.morphology, self.lemmas)
        item.tree.label = root.split('_', 1)[1]
        item.block = ElementTree.tostring(block)
        return item
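
_read_blocks above is the standard ElementTree.iterparse streaming idiom: take the root from the first event, collect each finished <s> element, and clear the root so memory use stays flat. A self-contained version of the same idiom over an in-memory document:

import io
from collections import OrderedDict
from xml.etree import ElementTree

xml = b"<corpus><s id='s1'><t/></s><s id='s2'><t/></s></corpus>"
context = ElementTree.iterparse(io.BytesIO(xml), events=('start', 'end'))
_, root = next(context)  # 'start' event of the root element

blocks = OrderedDict()
for event, elem in context:
    if event == 'end' and elem.tag == 's':
        blocks[elem.get('id')] = ElementTree.tostring(elem)
    root.clear()  # drop already-processed elements

print(list(blocks))  # ['s1', 's2']
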
Example #32
def startexp(
		prm,  # A DictObj with the structure of parser.DEFAULTS
		resultdir='results',
		rerun=False):
	"""Execute an experiment."""
	if rerun:
		if not os.path.exists(resultdir):
			raise ValueError('Directory %r does not exist.\n--rerun requires a'
					' directory with the grammar(s) of a previous experiment.'
					% resultdir)
	else:
		if os.path.exists(resultdir):
			raise ValueError('Directory %r exists.\n'
					'Use --rerun to parse with existing grammar '
					'and overwrite previous results.' % resultdir)
		os.mkdir(resultdir)

	# Log everything, and send it to stderr, in a format with just the message.
	formatstr = '%(message)s'
	if prm.verbosity == 0:
		logging.basicConfig(level=logging.WARNING, format=formatstr)
	elif prm.verbosity == 1:
		logging.basicConfig(level=logging.INFO, format=formatstr)
	elif prm.verbosity == 2:
		logging.basicConfig(level=logging.DEBUG, format=formatstr)
	elif 3 <= prm.verbosity <= 4:
		logging.basicConfig(level=5, format=formatstr)
	else:
		raise ValueError('verbosity should be >= 0 and <= 4.')

	# also log to a file
	fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
	fileobj.setLevel(logging.DEBUG)
	fileobj.setFormatter(logging.Formatter(formatstr))
	logging.getLogger('').addHandler(fileobj)
	logging.info('Disco-DOP %s, running on Python %s',
			__version__, sys.version.split()[0])
	if not rerun:
		trees, sents, train_tagged_sents = loadtraincorpus(
				prm.corpusfmt, prm.traincorpus, prm.binarization, prm.punct,
				prm.functions, prm.morphology, prm.removeempty, prm.ensureroot,
				prm.transformations, prm.relationalrealizational)
	elif isinstance(prm.traincorpus.numsents, float):
		raise ValueError('need to specify number of training set sentences, '
				'not fraction, in rerun mode.')

	testsettb = treebank.READERS[prm.corpusfmt](
			prm.testcorpus.path, encoding=prm.testcorpus.encoding,
			headrules=prm.binarization.headrules,
			removeempty=prm.removeempty, morphology=prm.morphology,
			functions=prm.functions, ensureroot=prm.ensureroot)
	if isinstance(prm.testcorpus.numsents, float):
		prm.testcorpus.numsents = int(prm.testcorpus.numsents
				* len(testsettb.blocks()))
	if prm.testcorpus.skiptrain:
		prm.testcorpus.skip += (  # pylint: disable=maybe-no-member
				prm.traincorpus.numsents)  # pylint: disable=maybe-no-member

	test_blocks = OrderedDict()
	test_trees = OrderedDict()
	test_tagged_sents = OrderedDict()
	for n, item in testsettb.itertrees(
			prm.testcorpus.skip,
			prm.testcorpus.skip  # pylint: disable=no-member
			+ prm.testcorpus.numsents):
		if 1 <= len(item.sent) <= prm.testcorpus.maxwords:
			test_blocks[n] = item.block
			test_trees[n] = item.tree
			test_tagged_sents[n] = [(word, tag) for word, (_, tag)
					in zip(item.sent, sorted(item.tree.pos()))]
	logging.info('%d test sentences after length restriction <= %d',
			len(test_trees), prm.testcorpus.maxwords)
	lexmodel = None
	simplelexsmooth = False
	test_tagged_sents_mangled = test_tagged_sents
	if prm.postagging and prm.postagging.method in (
			'treetagger', 'stanford', 'frog'):
		if prm.postagging.method == 'treetagger':
			# these two tags are never given by tree-tagger,
			# so collect words whose tag needs to be overridden
			overridetags = ('PTKANT', 'PIDAT')
		elif prm.postagging.method == 'stanford':
			overridetags = ('PTKANT', )
		elif prm.postagging.method == 'frog':
			overridetags = ()
		taglex = defaultdict(set)
		for sent in train_tagged_sents:
			for word, tag in sent:
				taglex[word].add(tag)
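		# map each override tag to the words seen exclusively with that tag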
		overridetagdict = {tag:
			{word for word, tags in taglex.items() if tags == {tag}}
			for tag in overridetags}
		tagmap = {'$(': '$[', 'PAV': 'PROAV'}
		test_tagged_sents_mangled = lexicon.externaltagging(
				prm.postagging.method, prm.postagging.model, test_tagged_sents,
				overridetagdict, tagmap)
		if prm.postagging.retag and not rerun:
			logging.info('re-tagging training corpus')
			sents_to_tag = OrderedDict(enumerate(train_tagged_sents))
			train_tagged_sents = lexicon.externaltagging(prm.postagging.method,
					prm.postagging.model, sents_to_tag, overridetagdict,
					tagmap).values()
			for tree, tagged in zip(trees, train_tagged_sents):
				for node in tree.subtrees(
						lambda n: len(n) == 1 and isinstance(n[0], int)):
					node.label = tagged[node[0]][1]
		usetags = True  # give these tags to parser
	elif prm.postagging and prm.postagging.method == 'unknownword':
		if not rerun:
			sents, lexmodel = getposmodel(prm.postagging, train_tagged_sents)
			simplelexsmooth = prm.postagging.simplelexsmooth
		usetags = False  # make sure gold POS tags are not given to parser
	else:
		usetags = True  # give gold POS tags to parser

	# 0: test sentences as they should be handed to the parser,
	# 1: gold trees for evaluation purposes
	# 2: gold sents because test sentences may be mangled by unknown word model
	# 3: blocks from treebank file to reproduce the relevant part of the
	#   original treebank verbatim.
	testset = OrderedDict((n, (
				test_tagged_sents_mangled[n],
				test_trees[n],
				test_tagged_sents[n],
				block))
			for n, block in test_blocks.items())
	if not test_tagged_sents:
		raise ValueError('test corpus (selection) should be non-empty.')

	if rerun:
		trees, sents = [], []
	roots = {t.label for t in trees} | {test_trees[n].label for n in testset}
	if len(roots) != 1:
		raise ValueError('expected unique ROOT label: %r' % roots)
	top = roots.pop()
	funcclassifier = None

	if rerun:
		parser.readgrammars(resultdir, prm.stages, prm.postagging, top)
		if prm.predictfunctions:
			from sklearn.externals import joblib
			funcclassifier = joblib.load('%s/funcclassifier.pickle' % resultdir)
	else:
		logging.info('read training & test corpus')
		if prm.predictfunctions:
			from sklearn.externals import joblib
			from . import functiontags
			logging.info('training function tag classifier')
			funcclassifier, msg = functiontags.trainfunctionclassifier(
					trees, sents, prm.numproc)
			joblib.dump(funcclassifier, '%s/funcclassifier.pickle' % resultdir,
					compress=3)
			logging.info(msg)
		getgrammars(dobinarization(trees, sents, prm.binarization,
					prm.relationalrealizational),
				sents, prm.stages, prm.testcorpus.maxwords, resultdir,
				prm.numproc, lexmodel, simplelexsmooth, top)
	evalparam = evalmod.readparam(prm.evalparam)
	evalparam['DEBUG'] = -1
	evalparam['CUTOFF_LEN'] = 40
	deletelabel = evalparam.get('DELETE_LABEL', ())
	deleteword = evalparam.get('DELETE_WORD', ())

	begin = time.clock()
	theparser = parser.Parser(prm, funcclassifier=funcclassifier)
	results = doparsing(parser=theparser, testset=testset, resultdir=resultdir,
			usetags=usetags, numproc=prm.numproc, deletelabel=deletelabel,
			deleteword=deleteword, corpusfmt=prm.corpusfmt,
			morphology=prm.morphology, evalparam=evalparam)
	if prm.numproc == 1:
		logging.info('time elapsed during parsing: %gs', time.clock() - begin)
	for result in results:
		nsent = len(result.parsetrees)
		overcutoff = any(len(a) > evalparam['CUTOFF_LEN']
				for a in test_tagged_sents.values())
		header = (' ' + result.name.upper() + ' ').center(
				44 if overcutoff else 35, '=')
		evalsummary = result.evaluator.summary()
		coverage = 'coverage: %s = %6.2f' % (
				('%d / %d' % (nsent - result.noparse, nsent)).rjust(
				25 if overcutoff else 14),
				100.0 * (nsent - result.noparse) / nsent)
		logging.info('\n'.join(('', header, evalsummary, coverage)))
	return top
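
One step above worth isolating is the taglex/overridetagdict construction: for each tag the external tagger never emits, it collects the words that were only ever seen with that tag in training, so they can be re-tagged afterwards. With toy data:

from collections import defaultdict

train_tagged_sents = [[('nein', 'PTKANT'), ('ja', 'PTKANT')],
                      [('ja', 'ADV')]]
overridetags = ('PTKANT',)

taglex = defaultdict(set)
for sent in train_tagged_sents:
    for word, tag in sent:
        taglex[word].add(tag)

# Only 'nein' is unambiguous: 'ja' was also seen as ADV.
overridetagdict = {tag: {w for w, tags in taglex.items() if tags == {tag}}
                   for tag in overridetags}
print(overridetagdict)  # {'PTKANT': {'nein'}}
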
Example #33
    def get_fields_and_labels_for_all_versions(self,
                                               lang=UNSPECIFIED_TRANSLATION,
                                               group_sep="/",
                                               hierarchy_in_labels=False,
                                               multiple_select="both"):
        """ Return 2 mappings containing field and labels by section

            This is needed because when making an export for several
            versions of the same form, fields get added, removed, and
            edited. Hence we pre-generate mappings containing labels
            and fields for all versions so we can use them later as a
            canvas to keep the export coherent.

            Labels are used as column headers.

            Fields are used to create rows of data from submissions.
        """

        # TODO: refactor this to use FormPack.get_fields_for_versions

        section_fields = OrderedDict()  # {section: [(name, field), ...]}
        section_labels = OrderedDict()  # {section: [field_label, field_label]}
        processed_fields = {}  # Used to avoid expensive lookups

        versions = list(self.versions.values())

        # List of fields we generate ourselves to add at the very end
        # of the field list
        auto_fields = OrderedDict()

        # Create the initial field mappings from the first form version
        for section_name, section in versions[0].sections.items():

            # Field mapping to the section containing them
            section_fields[section_name] = list(section.fields.items())

            # Field labels list mapping to the section containing them
            one_section_labels = section_labels[section_name] = []
            for field in section.fields.values():
                labels = field.get_labels(lang, group_sep, hierarchy_in_labels,
                                          multiple_select)
                one_section_labels.append(labels)

            # Set of processed field names for fast lookup
            field_names = section.fields.keys()
            processed_fields[section_name] = set(field_names)

            # Append optional additional fields
            auto_field_names = auto_fields[section_name] = []
            if section.children or self.force_index:
                auto_field_names.append('_index')

            if section.parent:
                auto_field_names.append('_parent_table_name')
                auto_field_names.append('_parent_index')

        # Process any new field added in the next versions
        # The hard part is to insert it at a position that makes sense
        for version in versions[1:]:
            for section_name, section in version.sections.items():

                # List of fields and labels we already got for this section
                # from all previous versions
                base_fields_list = section_fields[section_name]
                processed_field_names = processed_fields[section_name]
                base_fields_labels = section_labels[section_name]

                # Potential new fields we want to add
                new_fields = list(section.fields.keys())

                for i, new_field_name in enumerate(new_fields):
                    field = section.fields[new_field_name]
                    # Extract the labels for this field, language, group
                    # separator and multiple_select policy
                    labels = field.get_labels(lang, group_sep,
                                              hierarchy_in_labels,
                                              multiple_select)
                    # WARNING, labels is a list of labels for this field
                    # since multiple select answers can span on several columns

                    # We already processed that field and don't need to
                    # add it again, but we do replace its labels with the
                    # ones from the latest version available
                    if new_field_name in processed_field_names:
                        base_fields = enumerate(list(base_fields_list))
                        for j, _field_data in base_fields:
                            if len(_field_data) != 2:
                                # e.g. [u'location', u'_location_latitude',...]
                                continue
                            (name, _field) = _field_data
                            if name == new_field_name:
                                base_fields_labels[j] = labels
                                break
                        continue

                    # If the field appears at the start, insert it at the
                    # beginning of the lists
                    if i == 0:
                        base_fields_list.insert(0, (new_field_name, field))
                        base_fields_labels.insert(0, labels)
                        continue

                    # For any other field, we need a more advanced position
                    # logic.
                    # We take this new field, and look for all new fields after
                    # it to find the first one that is already in the base
                    # fields. Then we get its index, so we can insert our fresh
                    # new field right before it. This gives us a coherent
                    # order of fields so that they are always, at worst,
                    # adjacent to the field they used to be next to.
                    for following_new_field in new_fields[i + 1:]:
                        if following_new_field in processed_field_names:
                            base_fields = list(base_fields_list)
                            for y, (name, _field) in enumerate(base_fields):
                                if name == following_new_field:
                                    base_fields_list.insert(
                                        y, (new_field_name, field))
                                    base_fields_labels.insert(y, labels)
                                    break
                            break
                    else:  # We could not find one, so add it at the end
                        base_fields_list.append((new_field_name, field))
                        base_fields_labels.append(labels)

                    processed_field_names.add(new_field_name)

        # Flatten field labels and names. Indeed, field.get_labels()
        # and self.names return a list because a multiple select field can
        # have several values. We needed them grouped to insert them at the
        # proper index, but now we just want a flat list of all of them.

        # Flatten all the names for all the values of all the fields
        for section, fields in list(section_fields.items()):
            name_lists = []
            for _field_data in fields:
                if len(_field_data) != 2:
                    # e.g. [u'location', u'_location_latitude',...]
                    continue
                (field_name, field) = _field_data
                name_lists.append(field.value_names)

            names = [name for name_list in name_lists for name in name_list]

            # add auto fields:
            names.extend(auto_fields[section])

            section_fields[section] = names

        # Flatten all the labels for all the headers of all the fields
        for section, labels in list(section_labels.items()):
            labels = [label for label_group in labels for label in label_group]

            # add auto fields (names and labels are the same)
            labels.extend(auto_fields[section])

            section_labels[section] = labels

        return section_fields, section_labels
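
The positioning logic above reduces to a simple rule: for each field that is new in a later version, scan the fields following it for the first already-known one and insert just before it; failing that, append. A sketch of that rule over plain name lists (merge_field_names is a hypothetical helper, not part of formpack):

def merge_field_names(base, new):
    """Insert names from `new` into `base`, preserving adjacency."""
    known = set(base)
    for i, name in enumerate(new):
        if name in known:
            continue
        for follower in new[i + 1:]:
            if follower in known:
                base.insert(base.index(follower), name)
                break
        else:  # no known follower: append at the end
            base.append(name)
        known.add(name)
    return base

print(merge_field_names(['a', 'c'], ['a', 'b', 'c']))  # ['a', 'b', 'c']
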
Example #34
class AttrTree(object):
    """
    An AttrTree offers convenient, multi-level attribute access for
    collections of objects. AttrTree objects may also be combined
    together using the update method or merge classmethod. Here is an
    example of adding a ViewableElement to an AttrTree and accessing it:

    >>> t = AttrTree()
    >>> t.Example.Path = 1
    >>> t.Example.Path                             #doctest: +ELLIPSIS
    1
    """
    _disabled_prefixes = [] # Underscore attributes that should be disabled
    _sanitizer = util.sanitize_identifier

    @classmethod
    def merge(cls, trees):
        """
        Merge a collection of AttrTree objects.
        """
        first = trees[0]
        for tree in trees:
            first.update(tree)
        return first


    def __dir__(self):
        """
        The _dir_mode may be set to 'default' or 'user'; in 'user'
        mode, only the child nodes added by the user are listed.
        """
        dict_keys = self.__dict__.keys()
        if self.__dict__['_dir_mode'] == 'user':
            return self.__dict__['children']
        else:
            return dir(type(self)) + list(dict_keys)

    def __init__(self, items=None, identifier=None, parent=None, dir_mode='default'):
        """
        identifier: A string identifier for the current node (if any)
        parent:     The parent node (if any)
        items:      Items as (path, value) pairs to construct
                    (sub)tree down to given leaf values.

        Note that the root node does not have a parent and does not
        require an identifier.
        """
        self.__dict__['parent'] = parent
        self.__dict__['identifier'] = type(self)._sanitizer(identifier, escape=False)
        self.__dict__['children'] = []
        self.__dict__['_fixed'] = False
        self.__dict__['_dir_mode'] = dir_mode  # Either 'default' or 'user'

        fixed_error = 'No attribute %r in this AttrTree, and none can be added because fixed=True'
        self.__dict__['_fixed_error'] = fixed_error
        self.__dict__['data'] = OrderedDict()
        items = items.items() if isinstance(items, OrderedDict) else items
        # Python 3
        items = list(items) if items else items
        items = [] if not items else items
        for path, item in items:
            self.set_path(path, item)

    @property
    def path(self):
        "Returns the path up to the root for the current node."
        if self.parent:
            return '.'.join([self.parent.path, str(self.identifier)])
        else:
            return self.identifier if self.identifier else self.__class__.__name__


    @property
    def fixed(self):
        "If fixed, no new paths can be created via attribute access"
        return self.__dict__['_fixed']

    @fixed.setter
    def fixed(self, val):
        self.__dict__['_fixed'] = val


    def update(self, other):
        """
        Updates the contents of the current AttrTree with the
        contents of a second AttrTree.
        """
        if not isinstance(other, AttrTree):
            raise Exception('Can only update with another AttrTree type.')
        fixed_status = (self.fixed, other.fixed)
        (self.fixed, other.fixed) = (False, False)
        for identifier, element in other.items():
            if identifier not in self.data:
                self[identifier] = element
            else:
                self[identifier].update(element)
        (self.fixed, other.fixed) = fixed_status


    def set_path(self, path, val):
        """
        Set the given value at the supplied path where path is either
        a tuple of strings or a string in A.B.C format.
        """
        path = tuple(path.split('.')) if isinstance(path, str) else tuple(path)

        disallowed = [p for p in path if not type(self)._sanitizer.allowable(p)]
        if any(disallowed):
            raise Exception("Attribute strings in path elements cannot be "
                            "correctly escaped : %s" % ','.join(repr(el) for el in disallowed))
        if len(path) > 1:
            attrtree = self.__getattr__(path[0])
            attrtree.set_path(path[1:], val)
        else:
            self.__setattr__(path[0], val)


    def filter(self, path_filters):
        """
        Filters the loaded AttrTree using the supplied path_filters.
        """
        if not path_filters: return self

        # Convert string path filters
        path_filters = [tuple(pf.split('.')) if not isinstance(pf, tuple)
                        else pf for pf in path_filters]

        # Search for substring matches between paths and path filters
        new_attrtree = self.__class__()
        for path, item in self.data.items():
            if any([all([subpath in path for subpath in pf]) for pf in path_filters]):
                new_attrtree.set_path(path, item)

        return new_attrtree


    def _propagate(self, path, val):
        """
        Propagate the value up to the root node.
        """
        if val == '_DELETE':
            if path in self.data:
                del self.data[path]
            else:
                items = [(key, v) for key, v in self.data.items()
                         if not all(k==p for k, p in zip(key, path))]
                self.data = OrderedDict(items)
        else:
            self.data[path] = val
        if self.parent is not None:
            self.parent._propagate((self.identifier,)+path, val)


    def __setitem__(self, identifier, val):
        """
        Set a value at a child node with given identifier. If at a root
        node, multi-level path specifications are allowed (i.e. 'A.B.C'
        format or tuple format) in which case the behaviour matches
        that of set_path.
        """
        if isinstance(identifier, str) and '.' not in identifier:
            self.__setattr__(identifier, val)
        elif isinstance(identifier, str) and self.parent is None:
            self.set_path(tuple(identifier.split('.')), val)
        elif isinstance(identifier, tuple) and self.parent is None:
            self.set_path(identifier, val)
        else:
            raise Exception("Multi-level item setting only allowed from root node.")


    def __getitem__(self, identifier):
        """
        For a given non-root node, access a child element by identifier.

        If the node is a root node, you may also access elements using
        either tuple format or the 'A.B.C' string format.
        """
        split_label = (tuple(identifier.split('.'))
                       if isinstance(identifier, str) else tuple(identifier))
        if len(split_label) == 1:
            identifier = split_label[0]
            if identifier in self.children:
                return self.__dict__[identifier]
            else:
                raise KeyError(identifier)
        path_item = self
        for identifier in split_label:
            path_item = path_item[identifier]
        return path_item


    def __delitem__(self, identifier):
        split_label = (tuple(identifier.split('.'))
                       if isinstance(identifier, str) else tuple(identifier))
        if len(split_label) == 1:
            identifier = split_label[0]
            if identifier in self.children:
                del self.__dict__[identifier]
                self.children.pop(self.children.index(identifier))
            else:
                raise KeyError(identifier)
            self._propagate(split_label, '_DELETE')
        else:
            path_item = self
            for i, identifier in enumerate(split_label[:-1]):
                path_item = path_item[identifier]
            del path_item[split_label[-1]]


    def __setattr__(self, identifier, val):
        # Getattr is skipped for root and first set of children
        shallow = (self.parent is None or self.parent.parent is None)

        if util.tree_attribute(identifier) and self.fixed and shallow:
            raise AttributeError(self._fixed_error % identifier)

        super(AttrTree, self).__setattr__(identifier, val)

        if util.tree_attribute(identifier):
            if identifier not in self.children:
                self.children.append(identifier)
            self._propagate((identifier,), val)


    def __getattr__(self, identifier):
        """
        Access an identifier from the AttrTree or generate a new AttrTree
        with the chosen attribute path.
        """
        try:
            return super(AttrTree, self).__getattr__(identifier)
        except AttributeError: pass

        # Attributes starting with __ get name mangled
        if identifier.startswith('_' + type(self).__name__) or identifier.startswith('__'):
            raise AttributeError('Attribute %s not found.' % identifier)
        elif self.fixed:
            raise AttributeError(self._fixed_error % identifier)


        if not any(identifier.startswith(prefix)
                   for prefix in type(self)._disabled_prefixes):
            sanitized = type(self)._sanitizer(identifier, escape=False)
        else:
            sanitized = identifier

        if sanitized in self.children:
            return self.__dict__[sanitized]


        if not sanitized.startswith('_') and util.tree_attribute(identifier):
            self.children.append(sanitized)
            dir_mode = self.__dict__['_dir_mode']
            child_tree = self.__class__(identifier=sanitized,
                                        parent=self, dir_mode=dir_mode)
            self.__dict__[sanitized] = child_tree
            return child_tree
        else:
            raise AttributeError('%r object has no attribute %s.' %
                                 (type(self).__name__, identifier))


    def __iter__(self):
        return iter(self.data.values())


    def __contains__(self, name):
        return name in self.children or name in self.data


    def __len__(self):
        return len(self.data)


    def get(self, identifier, default=None):
        """Get a node of the AttrTree using its path string.

        Args:
            identifier: Path string of the node to return
            default: Value to return if no node is found

        Returns:
            The indexed node of the AttrTree
        """
        split_label = (tuple(identifier.split('.'))
                       if isinstance(identifier, str) else tuple(identifier))
        if len(split_label) == 1:
            identifier = split_label[0]
            return self.__dict__.get(identifier, default)
        path_item = self
        for identifier in split_label:
            if path_item == default or path_item is None:
                return default
            path_item = path_item.get(identifier, default)
        return path_item

    def keys(self):
        "Keys of nodes in the AttrTree"
        return list(self.data.keys())


    def items(self):
        "Keys and nodes of the AttrTree"
        return list(self.data.items())


    def values(self):
        "Nodes of the AttrTree"
        return list(self.data.values())


    def pop(self, identifier, default=None):
        """Pop a node of the AttrTree using its path string.

        Args:
            identifier: Path string of the node to return
            default: Value to return if no node is found

        Returns:
            The node that was removed from the AttrTree
        """
        if identifier in self.children:
            item = self[identifier]
            self.__delitem__(identifier)
            return item
        else:
            return default


    def __repr__(self):
        return PrettyPrinter.pprint(self)
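
Behind the attribute syntax, each AttrTree node mirrors its subtree in an OrderedDict keyed by path tuples, and _propagate pushes every assignment up to the root. The core bookkeeping, sketched without the holoviews utilities:

from collections import OrderedDict

class MiniTree:
    def __init__(self, identifier=None, parent=None):
        self.identifier, self.parent = identifier, parent
        self.data = OrderedDict()  # path tuple -> leaf value

    def set_path(self, path, val):
        self._propagate(tuple(path), val)

    def _propagate(self, path, val):
        # Record locally, then hand the prefixed path to the parent.
        self.data[path] = val
        if self.parent is not None:
            self.parent._propagate((self.identifier,) + path, val)

root = MiniTree()
child = MiniTree('Example', parent=root)
child.set_path(('Path',), 1)
print(root.data)  # maps ('Example', 'Path') -> 1
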
Example #35
def externaltagging(usetagger, model, sents, overridetag, tagmap):
	"""Use an external tool to tag a list of sentences."""
	logging.info('Start tagging.')
	goldtags = [t for sent in sents.values() for _, t in sent]
	if usetagger == 'treetagger':  # Tree-tagger
		if not os.path.exists('tree-tagger/bin/tree-tagger'):
			raise ValueError(TREETAGGERHELP)
		infile, inname = tempfile.mkstemp(text=True)
		with os.fdopen(infile, 'w') as infile:
			for tagsent in sents.values():
				sent = map(itemgetter(0), tagsent)
				infile.write('\n'.join(w.encode('utf-8')
					for w in sent) + '\n<S>\n')
		filtertags = ''
		if not model:
			model = 'tree-tagger/lib/german-par-linux-3.2-utf8.bin'
			filtertags = '| tree-tagger/cmd/filter-german-tags'
		tagger = Popen('tree-tagger/bin/tree-tagger -token -sgml'
				' %s %s %s' % (model, inname, filtertags),
				stdout=PIPE, shell=True)
		tagout = tagger.stdout.read(
				).decode('utf-8').split('<S>')[:-1]
		os.unlink(inname)
		taggedsents = OrderedDict((n, [tagmangle(a, None, overridetag, tagmap)
					for a in tags.splitlines() if a.strip()])
					for n, tags in zip(sents, tagout))
	elif usetagger == 'stanford':  # Stanford Tagger
		if not os.path.exists('stanford-postagger-full-2012-07-09'):
			raise ValueError(STANFORDTAGGERHELP)
		infile, inname = tempfile.mkstemp(text=True)
		with os.fdopen(infile, 'w') as infile:
			for tagsent in sents.values():
				sent = map(itemgetter(0), tagsent)
				infile.write(' '.join(w.encode('utf-8')
					for w in sent) + '\n')
		if not model:
			model = 'models/german-hgc.tagger'
		tagger = Popen(args=(
				'/usr/bin/java -mx2G -classpath stanford-postagger.jar'
				' edu.stanford.nlp.tagger.maxent.MaxentTagger'
				' -tokenize false -encoding utf-8'
				' -model %s -textFile %s' % (model, inname)).split(),
				cwd='stanford-postagger-full-2012-07-09',
				shell=False, stdout=PIPE)
		tagout = tagger.stdout.read(
				).decode('utf-8').splitlines()
		os.unlink(inname)
		taggedsents = OrderedDict((n, [tagmangle(a, '_', overridetag, tagmap)
			for a in tags.split()]) for n, tags in zip(sents, tagout))
	elif usetagger == 'frog':  # Dutch 'frog' tagger
		tagger = Popen(args=[which('frog')] +
					'-n --skip=tacmnp -t /dev/stdin'.split(),
				shell=False, stdin=PIPE, stdout=PIPE)
		tagout, stderr = tagger.communicate(''.join(
				' '.join(w for w in map(itemgetter(0), tagsent)) + '\n'
				for tagsent in sents.values()).encode('utf8'))
		logging.info(stderr)
		# lines consist of: 'idx token lemma POS score'
		taggedsents = OrderedDict((n,
				[(line.split()[1],
					line.split()[3].replace('(', '[').replace(')', ']'))
					for line in lines.splitlines()]) for n, lines
				in zip(sents, tagout.decode('utf-8').split('\n\n')))
	if len(taggedsents) != len(sents):
		raise ValueError('mismatch in number of sentences after tagging.')
	for n, tags in taggedsents.items():
		if len(sents[n]) != len(tags):
			raise ValueError('mismatch in number of tokens after tagging.\n'
				'before: %r\nafter: %r' % (sents[n], tags))
	newtags = [t for sent in taggedsents.values() for _, t in sent]
	logging.info('Tag accuracy: %5.2f\ngold - cand: %r\ncand - gold %r',
		(100 * accuracy(goldtags, newtags)),
		set(goldtags) - set(newtags), set(newtags) - set(goldtags))
	return taggedsents
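
After the external process returns, the function re-keys the tagger output by zipping it with the original OrderedDict of sentences, then checks that sentence and token counts still line up. That alignment-and-check pattern in isolation, with fabricated tagger output:

from collections import OrderedDict

sents = OrderedDict([('s1', [('the', 'DT'), ('cat', 'NN')]),
                     ('s2', [('runs', 'VB')])])
# Pretend this came back from the external tagger, one block per sentence.
tagout = [[('the', 'DT'), ('cat', 'NN')], [('runs', 'VBZ')]]

taggedsents = OrderedDict((n, tags) for n, tags in zip(sents, tagout))
if len(taggedsents) != len(sents):
    raise ValueError('mismatch in number of sentences after tagging.')
for n, tags in taggedsents.items():
    if len(sents[n]) != len(tags):
        raise ValueError('mismatch in number of tokens after tagging.')
print(list(taggedsents))  # ['s1', 's2']
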
Example #36
class CorpusReader(object):
	"""Abstract corpus reader."""

	def __init__(self, path, encoding='utf8', ensureroot=None, punct=None,
			headrules=None, removeempty=False,
			functions=None, morphology=None, lemmas=None):
		"""
		:param path: filename or pattern of corpus files; e.g., ``wsj*.mrg``.
		:param ensureroot: add root node with given label if necessary.
		:param removeempty: remove empty nodes and any empty ancestors; a
			terminal is empty if it is equal to None, '', or '-NONE-'.
		:param headrules: if given, read rules for assigning heads and apply
			them by ordering constituents according to their heads.
		:param punct: one of ...

			:None: leave punctuation as is [default].
			:'move': move punctuation to appropriate constituents
					using heuristics.
			:'moveall': same as 'move', but moves all preterminals under root,
					instead of only recognized punctuation.
			:'prune': prune away leading & ending quotes & periods, then move.
			:'remove': eliminate punctuation.
			:'removeall': eliminate all preterminals directly under root.
			:'root': attach punctuation directly to root
					(as in original Negra/Tiger treebanks).
		:param functions: one of ...

			:None, 'leave': leave syntactic labels as is [default].
			:'add': concatenate grammatical function to syntactic label,
				separated by a hyphen: e.g., ``NP => NP-SBJ``.
			:'remove': strip away hyphen-separated grammatical function,
				e.g., ``NP-SBJ => NP``.
			:'replace': replace syntactic label with grammatical function,
				e.g., ``NP => SBJ``.
		:param morphology: one of ...

			:None, 'no': use POS tags as preterminals [default].
			:'add': concatenate morphological information to POS tags,
				e.g., ``DET/sg.def``.
			:'replace': use morphological information as preterminal label
			:'between': add node with morphological information between
				POS tag and word, e.g., ``(DET (sg.def the))``.
		:param lemmas: one of ...

			:None: ignore lemmas [default].
			:'add': concatenate lemma to terminals, e.g., men/man.
			:'replace': use lemmas as terminals.
			:'between': insert lemma as node between POS tag and word."""
		self.removeempty = removeempty
		self.ensureroot = ensureroot
		self.functions = functions
		self.punct = punct
		self.morphology = morphology
		self.lemmas = lemmas
		self.headrules = readheadrules(headrules) if headrules else {}
		self._encoding = encoding
		try:
			self._filenames = (sorted(glob(path), key=numbase)
					if path != '-' else ['-'])
		except TypeError:
			print('all sentence IDs must have the same type signature '
					'(number, string)')
			raise
		for opts, opt in (
				((None, 'leave', 'add', 'replace', 'remove', 'between'),
					functions),
				((None, 'no', 'add', 'replace', 'between'), morphology),
				((None, 'no', 'move', 'moveall', 'remove', 'removeall',
					'prune', 'root'), punct),
				((None, 'no', 'add', 'replace', 'between'), lemmas),
				):
			if opt not in opts:
				raise ValueError('Expected one of %r. Got: %r' % (opts, opt))
		if not self._filenames:
			raise ValueError("no files matched pattern '%s' in %s" % (
					path, os.getcwd()))
		self._block_cache = None
		self._trees_cache = None

	def itertrees(self, start=None, end=None):
		"""
		:returns: an iterator returning tuples ``(key, item)``
			of sentences in corpus, where ``item`` is an :py:class:Item
			instance with ``tree``, ``sent``, and ``comment`` attributes.
			Useful when the dictionary of all trees in corpus would not fit in
			memory."""
		for n, a in islice(self._read_blocks(), start, end):
			yield n, self._parsetree(a)

	def trees(self):
		"""
		:returns: an ordered dictionary of parse trees
			(``Tree`` objects with integer indices as leaves)."""
		if not self._trees_cache:
			self._trees_cache = OrderedDict((n, self._parsetree(a))
					for n, a in self._read_blocks())
		return OrderedDict((n, a.tree) for n, a in self._trees_cache.items())

	def sents(self):
		"""
		:returns: an ordered dictionary of sentences,
			each sentence being a list of words."""
		if not self._trees_cache:
			self._trees_cache = OrderedDict((n, self._parsetree(a))
					for n, a in self._read_blocks())
		return OrderedDict((n, a.sent) for n, a in self._trees_cache.items())

	def tagged_sents(self):
		"""
		:returns: an ordered dictionary of tagged sentences,
			each tagged sentence being a list of (word, tag) pairs."""
		if not self._trees_cache:
			self._trees_cache = OrderedDict((n, self._parsetree(a))
					for n, a in self._read_blocks())
		return OrderedDict(
				(n, [(w, t) for w, (_, t) in zip(a.sent, sorted(a.tree.pos()))])
				for n, a in self._trees_cache.items())

	def blocks(self):
		"""
		:returns: a list of strings containing the raw representation of
			trees in the original treebank."""

	def _read_blocks(self):
		"""Iterate over blocks in corpus file corresponding to parse trees."""

	def _parse(self, block):
		""":returns: a parse tree given a string from the treebank file."""

	def _parsetree(self, block):
		""":returns: a transformed parse tree and sentence."""
		item = self._parse(block)
		if not item.sent:  # no sentence extracted; return item unchanged
			return item
		if self.removeempty:
			removeemptynodes(item.tree, item.sent)
		if self.ensureroot and item.tree.label != self.ensureroot:
			item.tree = ParentedTree(self.ensureroot, [item.tree])
		if not isinstance(self, BracketCorpusReader):
			# roughly order constituents by order in sentence
			for a in reversed(list(item.tree.subtrees(lambda x: len(x) > 1))):
				a.children.sort(key=Tree.leaves)
		if self.punct:
			applypunct(self.punct, item.tree, item.sent)
		if self.headrules:
			applyheadrules(item.tree, self.headrules)
		return item

	def _word(self, block):
		""":returns: a list of words given a string."""
		if self.punct in {'remove', 'prune'}:
			return self._parsetree(block).sent
		return self._parse(block).sent
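
The constructor above validates each option against a whitelist and raises ValueError otherwise. A minimal standalone sketch of the same pattern (the function name and free-standing form are illustrative, not part of the reader API):

def validate_options(functions=None, morphology=None, punct=None, lemmas=None):
    """Raise ValueError unless every option has one of its allowed values."""
    allowed = (
        ('functions', functions,
            (None, 'leave', 'add', 'replace', 'remove', 'between')),
        ('morphology', morphology, (None, 'no', 'add', 'replace', 'between')),
        ('punct', punct, (None, 'no', 'move', 'moveall', 'remove',
            'removeall', 'prune', 'root')),
        ('lemmas', lemmas, (None, 'no', 'add', 'replace', 'between')),
    )
    for name, value, opts in allowed:
        if value not in opts:
            raise ValueError('%s: expected one of %r. Got: %r'
                    % (name, opts, value))

validate_options(functions='add', punct='move')  # passes silently
# validate_options(morphology='yes')  -> raises ValueError
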
Example #37
0
 def test_setitem(self):
     od = OrderedDict([('d', 1), ('b', 2), ('c', 3), ('a', 4), ('e', 5)])
     od['c'] = 10           # existing element
     od['f'] = 20           # new element
     self.assertEqual(list(od.items()),
                      [('d', 1), ('b', 2), ('c', 10), ('a', 4), ('e', 5), ('f', 20)])
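
For reference, the semantics this test pins down: assigning to an existing key updates the value in place without moving the key, while assigning a new key appends it at the end. A quick self-contained check:

from collections import OrderedDict

od = OrderedDict([('d', 1), ('b', 2)])
od['d'] = 10  # existing key: value replaced, position kept
od['z'] = 3   # new key: appended at the end
assert list(od.items()) == [('d', 10), ('b', 2), ('z', 3)]
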
Example #38
0
def startexp(
        prm,  # A DictObj with the structure of parser.DEFAULTS
        resultdir='results',
        rerun=False):
    """Execute an experiment."""
    if rerun:
        if not os.path.exists(resultdir):
            raise ValueError('Directory %r does not exist.\n--rerun requires a'
                             ' directory with the grammar(s) of a previous experiment.'
                             % resultdir)
    else:
        if os.path.exists(resultdir):
            raise ValueError('Directory %r exists.\n'
                             'Use --rerun to parse with existing grammar '
                             'and overwrite previous results.' % resultdir)
        os.mkdir(resultdir)

    # Log everything, and send it to stderr, in a format with just the message.
    formatstr = '%(message)s'
    if prm.verbosity == 0:
        logging.basicConfig(level=logging.WARNING, format=formatstr)
    elif prm.verbosity == 1:
        logging.basicConfig(level=logging.INFO, format=formatstr)
    elif prm.verbosity == 2:
        logging.basicConfig(level=logging.DEBUG, format=formatstr)
    elif 3 <= prm.verbosity <= 4:
        logging.basicConfig(level=5, format=formatstr)
    else:
        raise ValueError('verbosity should be >= 0 and <= 4.')

    # also log to a file
    fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
    fileobj.setLevel(logging.DEBUG)
    fileobj.setFormatter(logging.Formatter(formatstr))
    logging.getLogger('').addHandler(fileobj)
    logging.info('Disco-DOP %s, running on Python %s',
                 __version__, sys.version.split()[0])
    if not rerun:
        trees, sents, train_tagged_sents = loadtraincorpus(
            prm.corpusfmt, prm.traincorpus, prm.binarization, prm.punct,
            prm.functions, prm.morphology, prm.removeempty, prm.ensureroot,
            prm.transformations, prm.relationalrealizational)
    elif isinstance(prm.traincorpus.numsents, float):
        raise ValueError('need to specify number of training set sentences, '
                         'not fraction, in rerun mode.')

    testsettb = treebank.READERS[prm.corpusfmt](
        prm.testcorpus.path, encoding=prm.testcorpus.encoding,
        headrules=prm.binarization.headrules,
        removeempty=prm.removeempty, morphology=prm.morphology,
        functions=prm.functions, ensureroot=prm.ensureroot)
    if isinstance(prm.testcorpus.numsents, float):
        prm.testcorpus.numsents = int(prm.testcorpus.numsents
                                      * len(testsettb.blocks()))
    if prm.testcorpus.skiptrain:
        prm.testcorpus.skip += (  # pylint: disable=maybe-no-member
            prm.traincorpus.numsents)  # pylint: disable=maybe-no-member

    test_blocks = OrderedDict()
    test_trees = OrderedDict()
    test_tagged_sents = OrderedDict()
    for n, item in testsettb.itertrees(
            prm.testcorpus.skip,
            prm.testcorpus.skip  # pylint: disable=no-member
            + prm.testcorpus.numsents):
        if 1 <= len(item.sent) <= prm.testcorpus.maxwords:
            test_blocks[n] = item.block
            test_trees[n] = item.tree
            test_tagged_sents[n] = [(word, tag) for word, (_, tag)
                                    in zip(item.sent, sorted(item.tree.pos()))]
    logging.info('%d test sentences after length restriction <= %d',
                 len(test_trees), prm.testcorpus.maxwords)
    lexmodel = None
    simplelexsmooth = False
    test_tagged_sents_mangled = test_tagged_sents
    if prm.postagging and prm.postagging.method in (
            'treetagger', 'stanford', 'frog'):
        if prm.postagging.method == 'treetagger':
            # these two tags are never given by tree-tagger,
            # so collect words whose tag needs to be overridden
            overridetags = ('PTKANT', 'PIDAT')
        elif prm.postagging.method == 'stanford':
            overridetags = ('PTKANT', )
        elif prm.postagging.method == 'frog':
            overridetags = ()
        taglex = defaultdict(set)
        for sent in train_tagged_sents:
            for word, tag in sent:
                taglex[word].add(tag)
        overridetagdict = {tag:
                           {word for word, tags in taglex.items() if tags == {tag}}
                           for tag in overridetags}
        tagmap = {'$(': '$[', 'PAV': 'PROAV'}
        test_tagged_sents_mangled = lexicon.externaltagging(
            prm.postagging.method, prm.postagging.model, test_tagged_sents,
            overridetagdict, tagmap)
        if prm.postagging.retag and not rerun:
            logging.info('re-tagging training corpus')
            sents_to_tag = OrderedDict(enumerate(train_tagged_sents))
            train_tagged_sents = lexicon.externaltagging(
                prm.postagging.method, prm.postagging.model, sents_to_tag,
                overridetagdict, tagmap).values()
            for tree, tagged in zip(trees, train_tagged_sents):
                for node in tree.subtrees(
                        lambda n: len(n) == 1 and isinstance(n[0], int)):
                    node.label = tagged[node[0]][1]
        usetags = True  # give these tags to parser
    elif prm.postagging and prm.postagging.method == 'unknownword':
        if not rerun:
            sents, lexmodel = getposmodel(prm.postagging, train_tagged_sents)
            simplelexsmooth = prm.postagging.simplelexsmooth
        usetags = False  # make sure gold POS tags are not given to parser
    else:
        usetags = True  # give gold POS tags to parser

    # 0: test sentences as they should be handed to the parser,
    # 1: gold trees for evaluation purposes
    # 2: gold sents because test sentences may be mangled by unknown word model
    # 3: blocks from treebank file to reproduce the relevant part of the
    #   original treebank verbatim.
    testset = OrderedDict((n, (
        test_tagged_sents_mangled[n],
        test_trees[n],
        test_tagged_sents[n],
        block))
        for n, block in test_blocks.items())
    if not test_tagged_sents:
        raise ValueError('test corpus (selection) should be non-empty.')

    if rerun:
        trees, sents = [], []
    roots = {t.label for t in trees} | {test_trees[n].label for n in testset}
    if len(roots) != 1:
        raise ValueError('expected unique ROOT label: %r' % roots)
    top = roots.pop()
    funcclassifier = None

    if rerun:
        parser.readgrammars(resultdir, prm.stages, prm.postagging, top)
        if prm.predictfunctions:
            from sklearn.externals import joblib
            funcclassifier = joblib.load('%s/funcclassifier.pickle' % resultdir)
    else:
        logging.info('read training & test corpus')
        if prm.predictfunctions:
            from sklearn.externals import joblib
            from . import functiontags
            logging.info('training function tag classifier')
            funcclassifier, msg = functiontags.trainfunctionclassifier(
                trees, sents, prm.numproc)
            joblib.dump(funcclassifier, '%s/funcclassifier.pickle' % resultdir,
                        compress=3)
            logging.info(msg)
        getgrammars(dobinarization(trees, sents, prm.binarization,
                                   prm.relationalrealizational),
                    sents, prm.stages, prm.testcorpus.maxwords, resultdir,
                    prm.numproc, lexmodel, simplelexsmooth, top)
    evalparam = evalmod.readparam(prm.evalparam)
    evalparam['DEBUG'] = -1
    evalparam['CUTOFF_LEN'] = 40
    deletelabel = evalparam.get('DELETE_LABEL', ())
    deleteword = evalparam.get('DELETE_WORD', ())

    begin = time.perf_counter()  # time.clock() was removed in Python 3.8
    theparser = parser.Parser(prm, funcclassifier=funcclassifier)
    results = doparsing(parser=theparser, testset=testset, resultdir=resultdir,
                        usetags=usetags, numproc=prm.numproc, deletelabel=deletelabel,
                        deleteword=deleteword, corpusfmt=prm.corpusfmt,
                        morphology=prm.morphology, evalparam=evalparam)
    if prm.numproc == 1:
        logging.info('time elapsed during parsing: %gs',
                     time.perf_counter() - begin)
    for result in results:
        nsent = len(result.parsetrees)
        overcutoff = any(len(a) > evalparam['CUTOFF_LEN']
                         for a in test_tagged_sents.values())
        header = (' ' + result.name.upper() + ' ').center(
            44 if overcutoff else 35, '=')
        evalsummary = result.evaluator.summary()
        coverage = 'coverage: %s = %6.2f' % (
            ('%d / %d' % (nsent - result.noparse, nsent)).rjust(
                25 if overcutoff else 14),
            100.0 * (nsent - result.noparse) / nsent)
        logging.info('\n'.join(('', header, evalsummary, coverage)))
    return top
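
The verbosity dispatch at the top of startexp can be read as a table lookup from a small integer to a logging level. A minimal sketch of just that part, assuming the same 0-4 range (the helper name is illustrative):

import logging

def configure_logging(verbosity, formatstr='%(message)s'):
    """Map verbosity 0-4 to a logging level, as startexp does inline."""
    levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG,
              3: 5, 4: 5}  # 3 and 4 use a custom level below DEBUG
    if verbosity not in levels:
        raise ValueError('verbosity should be >= 0 and <= 4.')
    logging.basicConfig(level=levels[verbosity], format=formatstr)

configure_logging(1)
logging.info('visible at verbosity >= 1')
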
Example #39
0
class CorpusReader(object):
    """Abstract corpus reader."""

    def __init__(self, path, encoding='utf8', ensureroot=None, punct=None,
                 headrules=None, removeempty=False,
                 functions=None, morphology=None, lemmas=None):
        """
        :param path: filename or pattern of corpus files; e.g., ``wsj*.mrg``.
        :param ensureroot: add root node with given label if necessary.
        :param removeempty: remove empty nodes and any empty ancestors; a
                terminal is empty if it is equal to None, '', or '-NONE-'.
        :param headrules: if given, read rules for assigning heads and apply
                them by ordering constituents according to their heads.
        :param punct: one of ...

                :None: leave punctuation as is [default].
                :'move': move punctuation to appropriate constituents
                                using heuristics.
                :'moveall': same as 'move', but moves all preterminals under root,
                                instead of only recognized punctuation.
                :'prune': prune away leading & ending quotes & periods, then move.
                :'remove': eliminate punctuation.
                :'removeall': eliminate all preterminals directly under root.
                :'root': attach punctuation directly to root
                                (as in original Negra/Tiger treebanks).
        :param functions: one of ...

                :None, 'leave': leave syntactic labels as is [default].
                :'add': concatenate grammatical function to syntactic label,
                        separated by a hyphen: e.g., ``NP => NP-SBJ``.
                :'remove': strip away hyphen-separated grammatical function,
                        e.g., ``NP-SBJ => NP``.
                :'replace': replace syntactic label with grammatical function,
                        e.g., ``NP => SBJ``.
        :param morphology: one of ...

                :None, 'no': use POS tags as preterminals [default].
                :'add': concatenate morphological information to POS tags,
                        e.g., ``DET/sg.def``.
                :'replace': use morphological information as preterminal label.
                :'between': add node with morphological information between
                        POS tag and word, e.g., ``(DET (sg.def the))``.
        :param lemmas: one of ...

                :None: ignore lemmas [default].
                :'add': concatenate lemma to terminals, e.g., ``men/man``.
                :'replace': use lemmas as terminals.
                :'between': insert lemma as node between POS tag and word."""
        self.removeempty = removeempty
        self.ensureroot = ensureroot
        self.functions = functions
        self.punct = punct
        self.morphology = morphology
        self.lemmas = lemmas
        self.headrules = readheadrules(headrules) if headrules else {}
        self._encoding = encoding
        try:
            self._filenames = (sorted(glob(path), key=numbase)
                               if path != '-' else ['-'])
        except TypeError:
            print('all sentence IDs must have the same type signature '
                  '(number, string)')
            raise
        for opts, opt in (
            ((None, 'leave', 'add', 'replace', 'remove', 'between'),
             functions),
            ((None, 'no', 'add', 'replace', 'between'), morphology),
            ((None, 'no', 'move', 'moveall', 'remove', 'removeall',
              'prune', 'root'), punct),
            ((None, 'no', 'add', 'replace', 'between'), lemmas),
        ):
            if opt not in opts:
                raise ValueError('Expected one of %r. Got: %r' % (opts, opt))
        if not self._filenames:
            raise ValueError("no files matched pattern '%s' in %s" % (
                path, os.getcwd()))
        self._block_cache = None
        self._trees_cache = None

    def itertrees(self, start=None, end=None):
        """
        :returns: an iterator returning tuples ``(key, item)``
                of sentences in corpus, where ``item`` is an :py:class:`Item`
                instance with ``tree``, ``sent``, and ``comment`` attributes.
                Useful when the dictionary of all trees in corpus would not fit in
                memory."""
        for n, a in islice(self._read_blocks(), start, end):
            yield n, self._parsetree(a)

    def trees(self):
        """
        :returns: an ordered dictionary of parse trees
                (``Tree`` objects with integer indices as leaves)."""
        if not self._trees_cache:
            self._trees_cache = OrderedDict((n, self._parsetree(a))
                                            for n, a in self._read_blocks())
        return OrderedDict((n, a.tree) for n, a in self._trees_cache.items())

    def sents(self):
        """
        :returns: an ordered dictionary of sentences,
                each sentence being a list of words."""
        if not self._trees_cache:
            self._trees_cache = OrderedDict((n, self._parsetree(a))
                                            for n, a in self._read_blocks())
        return OrderedDict((n, a.sent) for n, a in self._trees_cache.items())

    def tagged_sents(self):
        """
        :returns: an ordered dictionary of tagged sentences,
                each tagged sentence being a list of (word, tag) pairs."""
        if not self._trees_cache:
            self._trees_cache = OrderedDict((n, self._parsetree(a))
                                            for n, a in self._read_blocks())
        return OrderedDict(
            (n, [(w, t) for w, (_, t) in zip(a.sent, sorted(a.tree.pos()))])
            for n, a in self._trees_cache.items())

    def blocks(self):
        """
        :returns: a list of strings containing the raw representation of
                trees in the original treebank."""

    def _read_blocks(self):
        """Iterate over blocks in corpus file corresponding to parse trees."""

    def _parse(self, block):
        """:returns: a parse tree given a string from the treebank file."""

    def _parsetree(self, block):
        """:returns: a transformed parse tree and sentence."""
        item = self._parse(block)
        if not item.sent:  # no sentence extracted; return item unchanged
            return item
        if self.removeempty:
            removeemptynodes(item.tree, item.sent)
        if self.ensureroot and item.tree.label != self.ensureroot:
            item.tree = ParentedTree(self.ensureroot, [item.tree])
        if not isinstance(self, BracketCorpusReader):
            # roughly order constituents by order in sentence
            for a in reversed(list(item.tree.subtrees(lambda x: len(x) > 1))):
                a.children.sort(key=Tree.leaves)
        if self.punct:
            applypunct(self.punct, item.tree, item.sent)
        if self.headrules:
            applyheadrules(item.tree, self.headrules)
        return item

    def _word(self, block):
        """:returns: a list of words given a string."""
        if self.punct in {'remove', 'prune'}:
            return self._parsetree(block).sent
        return self._parse(block).sent
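
The alignment trick in tagged_sents relies on leaves being integer indices into the sentence: sorting tree.pos() puts the (index, tag) pairs in sentence order, so zipping them with the word list pairs each word with its tag. A self-contained illustration with plain tuples instead of Tree objects:

sent = ['the', 'cat', 'sleeps']
pos = [(2, 'VB'), (0, 'DT'), (1, 'NN')]  # (leaf index, tag), traversal order

tagged = [(w, t) for w, (_, t) in zip(sent, sorted(pos))]
assert tagged == [('the', 'DT'), ('cat', 'NN'), ('sleeps', 'VB')]
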
Example #40
0
class FormVersion(object):
    @classmethod
    def verify_schema_structure(cls, struct):
        if 'content' not in struct:
            raise SchemaError('version content must have "content"')
        if 'survey' not in struct['content']:
            raise SchemaError('version content must have "survey"')
        validate_content(struct['content'])

    # QUESTION FOR ALEX: get rid of _root_node_name? What is it for?
    def __init__(self, form_pack, schema):

        # QUESTION FOR ALEX: why this check?
        if 'name' in schema:
            raise ValueError('FormVersion should not have a name parameter. '
                             'consider using "title" or "id_string"')
        self.schema = schema
        self.form_pack = form_pack

        # slug of title
        self.root_node_name = self._get_root_node_name()

        # form version id, unique to this version of the form
        self.id = schema.get('version')
        self.version_id_key = schema.get('version_id_key',
                                         form_pack.default_version_id_key)

        # form string id, unique to this form, shared across versions
        self.id_string = schema.get('id_string')

        # TODO: set the title of the last version as the name of the first
        # section?
        # Human readable title for this version
        self.title = schema.get('title', form_pack.title)

        # List of available languages for translation. One translation does
        # not mean all labels are translated, but at least one is.
        # One special translation not listed here is "_default", which
        # uses either the only label available, or the field name.
        # This will be converted down the line to a list. We use an OrderedDict
        # to maintain order and remove duplicates, but will need indexing later
        self.translations = OrderedDict()

        # Sections separate fields from various levels of nesting in case
        # we have repeat groups. If you don't have repeat groups, you have
        # only one section; if you do, you will have one section per
        # repeat group. Sections eventually become sheets in the xls export.
        self.sections = OrderedDict()

        content = self.schema['content']

        # materialized as a list, since it is reused and indexed later
        self.translations = [t if t is not None else UNTRANSLATED
                             for t in content.get('translations', [None])]

        # TODO: put those parts in a separate method and unit test it
        survey = content.get('survey', [])
        fields_by_name = {row.get('name'): row for row in survey}

        # Analyze the survey schema and extract the information we need
        # to build the export: the sections, the choices, the fields
        # and translations for each of them.

        # Extract choices data.
        # Choices are the list of values you can choose from to answer a
        # specific question. They can have translatable labels.
        choices_definition = content.get('choices', ())
        field_choices = FormChoice.all_from_json_definition(
            choices_definition, self.translations)

        # Extract fields data
        group = None
        section = FormSection(name=form_pack.title)
        self.sections[form_pack.title] = section

        # These keep track of where we are while traversing the
        # schema.
        # `hierarchy` contains all the levels, mixing groups and sections,
        # including the first and last ones, while the stacks are just a
        # history of previous levels, one for groups and one for sections.
        hierarchy = [section]
        group_stack = []
        section_stack = []

        for data_definition in survey:
            data_type = data_definition.get('type')
            if not data_type:  # handle broken data type definition
                continue

            data_type = normalize_data_type(data_type)
            name = data_definition.get('name')

            # parse closing of groups and repeats
            if data_type is None:
                continue

            if data_type == 'end_group':
                # We go up one level of nesting, so we set the current group
                # to be what used to be the parent group. We also remove one
                # level from the hierarchy.
                hierarchy.pop()
                group = group_stack.pop()
                continue

            if data_type == 'end_repeat':
                # We go up one level of nesting, so we set the current
                # section to be what used to be the parent section.
                hierarchy.pop()
                section = section_stack.pop()
                continue

            # parse definitions of named rows, such as fields
            # or opening groups and repeats
            if name is None:
                continue

            if data_type == 'begin_group':
                group_stack.append(group)
                group = FormGroup.from_json_definition(
                    data_definition,
                    translations=self.translations,
                )
                # We go down one level of nesting, so save the parent group.
                # The parent may be None; in that case we are at the top level.
                hierarchy.append(group)
                continue

            if data_type == 'begin_repeat':
                # We go down one level of nesting, so save the parent section.
                # The parent may be None; in that case we are at the top level.
                parent_section = section

                section = FormSection.from_json_definition(
                    data_definition,
                    hierarchy,
                    parent=parent_section,
                    translations=self.translations,
                )
                self.sections[section.name] = section
                hierarchy.append(section)
                section_stack.append(parent_section)
                parent_section.children.append(section)
                continue

            # If we are here, it's a regular field
            # Get the data name and type
            field = FormField.from_json_definition(
                data_definition,
                hierarchy,
                section,
                field_choices,
                translations=self.translations)
            section.fields[field.name] = field

            _f = fields_by_name[field.name]
            _labels = LabelStruct()

            if 'label' in _f:
                if not isinstance(_f['label'], list):
                    _f['label'] = [_f['label']]
                _labels = LabelStruct(labels=_f['label'],
                                      translations=self.translations)

            field.labels = _labels
            assert 'labels' not in _f

    # FIXME: Find a safe way to use this. Wrapping with try/except isn't enough
    # to fix https://github.com/kobotoolbox/formpack/issues/150
    #
    #def __repr__(self):
    #    return '<FormVersion %s>' % self._stats()

    def _stats(self):
        _stats = OrderedDict()
        _stats['id_string'] = self._get_id_string()
        _stats['version'] = self.id
        _stats['row_count'] = len(
            self.schema.get('content', {}).get('survey', []))
        # returns stats in the format [ key="value" ]
        return '\n\t'.join(
            map(lambda key: '%s="%s"' % (key, str(_stats[key])),
                _stats.keys()))

    def to_dict(self, **opts):
        return flatten_content(self.schema['content'], **opts)

    # TODO: find where to move that
    def _load_submission_xml(self, xml):
        raise NotImplementedError("This doesn't work now that submissions "
                                  "are out of the class. Port it to Export.")
        _xmljson = parse_xml_to_xmljson(xml)
        _rootatts = _xmljson.get('attributes', {})
        _id_string = _rootatts.get('id_string')
        _version_id = _rootatts.get('version')
        if _id_string != self._get_id_string():
            raise ValueError('submission id_string does not match: %s != %s' %
                             (self._get_id_string(), _id_string))
        if _version_id != self.form_pack.id_string:
            raise ValueError('mismatching version id %s != %s' %
                             (self.form_pack.id_string, _version_id))
        self.submissions.append(FormSubmission.from_xml(_xmljson, self))

    def lookup(self, prop, default=None):
        result = getattr(self, prop, None)
        if result is None:
            result = self.form_pack.lookup(prop, default=default)
        return result

    def _get_root_node_name(self):
        return self.lookup('root_node_name', default='data')

    def _get_id_string(self):
        return self.lookup('id_string')

    def _get_title(self):
        '''
        If this version has no title, use the form pack's title.
        '''
        if self.title is None:
            return self.form_pack.title
        return self.title

    def get_labels(self, lang=UNTRANSLATED, group_sep=None):
        """ Returns a mapping of labels for {section: [field_label, ...]...}

            Section and field labels can be set to use their slug name,
            their lone label, or one of the translated labels.

            If a field is part of a group and a group separator is passed,
            the group label is retrieved, possibly translated, and
            prepended to the field label itself.
        """

        all_labels = OrderedDict()
        for section_name, section in self.sections.items():

            section_label = section.labels.get(lang) or section_name
            section_labels = all_labels[section_label] = []

            for field_name, field in section.fields.items():
                section_labels.extend(field.get_labels(lang, group_sep))

        return all_labels

    def to_xml(self, warnings=None):
        # todo: collect warnings from pyxform compilation when a list is passed
        survey = formversion_pyxform(
            self.to_dict(remove_sheets=['translations', 'translated']))
        title = self._get_title()

        if title is None:
            raise ValueError('cannot create xml on a survey with no title.')

        survey.update({
            'name': self.lookup('root_node_name', 'data'),
            'id_string': self.lookup('id_string'),
            'title': self.lookup('title'),
            'version': self.lookup('id'),
        })
        return survey._to_pretty_xml().encode('utf-8')
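
The begin_group/end_group and begin_repeat/end_repeat handling in the constructor is a plain stack walk over the flat survey list. A minimal sketch of the same idea, reduced to group nesting only (the function and row layout here are illustrative):

def group_paths(survey):
    """Yield (group path, field name) for every field row."""
    stack = []  # names of currently open groups
    for row in survey:
        data_type, name = row.get('type'), row.get('name')
        if data_type == 'begin_group':
            stack.append(name)  # go down one level of nesting
        elif data_type == 'end_group':
            stack.pop()         # go back up one level
        elif data_type:         # a regular field
            yield '/'.join(stack), name

survey = [
    {'type': 'text', 'name': 'a'},
    {'type': 'begin_group', 'name': 'g'},
    {'type': 'text', 'name': 'b'},
    {'type': 'end_group'},
]
assert list(group_paths(survey)) == [('', 'a'), ('g', 'b')]
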
Example #41
0
    def get_fields_and_labels_for_all_versions(self, lang=UNSPECIFIED_TRANSLATION, group_sep="/",
                                                hierarchy_in_labels=False,
                                                multiple_select="both"):
        """ Return 2 mappings containing field and labels by section

            This is needed because when making an export for several
            versions of the same form, fields get added, removed, and
            edited. Hence we pre-generate mappings containing labels
            and fields for all versions so we can use them later as a
            canvas to keep the export coherent.

            Labels are used as column headers.

            Fields are used to create rows of data from submissions.
        """

        # TODO: refactor this to use FormPack.get_fields_for_versions

        section_fields = OrderedDict()  # {section: [(name, field), ...]}
        section_labels = OrderedDict()  # {section: [field_label, ...]}
        processed_fields = {}  # Used to avoid expensive lookups

        versions = list(self.versions.values())

        # List of fields we generate ourselves, to add at the very end
        # of the field list
        auto_fields = OrderedDict()

        # Create the initial field mappings from the first form version
        for section_name, section in versions[0].sections.items():

            # Field mapping to the section containing them
            section_fields[section_name] = list(section.fields.items())

            # Field labels list mapping to the section containing them
            one_section_labels = section_labels[section_name] = []
            for field in section.fields.values():
                labels = field.get_labels(lang, group_sep,
                                          hierarchy_in_labels,
                                          multiple_select)
                one_section_labels.append(labels)

            # Set of processed field names for fast lookup
            processed_fields[section_name] = set(section.fields)

            # Append optional additional fields
            auto_field_names = auto_fields[section_name] = []
            if section.children or self.force_index:
                auto_field_names.append('_index')

            if section.parent:
                auto_field_names.append('_parent_table_name')
                auto_field_names.append('_parent_index')

        # Process any new field added in the next versions
        # The hard part is to insert it at a position that makes sense
        for version in versions[1:]:
            for section_name, section in version.sections.items():

                # List of fields and labels we already got for this section
                # from all previous versions
                base_fields_list = section_fields[section_name]
                processed_field_names = processed_fields[section_name]
                base_fields_labels = section_labels[section_name]

                # Potential new fields we want to add
                new_fields = list(section.fields.keys())

                for i, new_field_name in enumerate(new_fields):
                    # Look up the field object for this name, then extract
                    # the labels for this field, language, group separator
                    # and multiple_select policy
                    field = section.fields[new_field_name]
                    labels = field.get_labels(lang, group_sep,
                                              hierarchy_in_labels,
                                              multiple_select)
                    # WARNING: labels is a list of labels for this field,
                    # since multiple select answers can span several columns

                    # We already processed that field and don't need to add
                    # it again, but we replace its labels with the latest
                    # version available
                    if new_field_name in processed_field_names:
                        base_labels = enumerate(list(base_fields_labels))
                        for i, _labels in base_labels:
                            if len(_labels) != 2:
                                # e.g. [u'location', u'_location_latitude',...]
                                continue
                            (name, field) = _labels
                            if name == new_field_name:
                                base_fields_labels[i] = labels
                                break
                        continue

                    # If the field appears at the start, insert it at the
                    # beginning of the lists
                    if i == 0:
                        base_fields_list.insert(0, new_field_name)
                        base_fields_labels.insert(0, labels)
                        continue

                    # For any other field, we need a more advanced position
                    # logic.
                    # We take this new field, and look for all new fields after
                    # it to find the first one that is already in the base
                    # fields. Then we get its index, so we can insert our fresh
                    # new field right before it. This gives us a coherent
                    # order of fields so that they are always, at worst,
                    # adjacent to the last field they used to follow.
                    for following_new_field in new_fields[i+1:]:
                        if following_new_field in processed_field_names:
                            base_fields = list(base_fields_list)
                            for y, (name, field) in enumerate(base_fields):
                                if name == following_new_field:
                                    base_fields_list.insert(y, new_field_name)
                                    base_fields_labels.insert(y, labels)
                                    break
                            break
                    else:  # We could not find one, so add it at the end
                        base_fields_list.append(new_field_name)
                        base_fields_labels.append(labels)

                    processed_field_names.add(new_field_name)

        # Flatten field labels and names. Indeed, field.get_labels()
        # and field.value_names return lists, because a multiple select field
        # can have several values. We needed them grouped to insert them at
        # the proper index, but now we just want a flat list of all of them.

        # Flatten all the names for all the values of all the fields
        for section, fields in list(section_fields.items()):
            name_lists = []
            for _field_data in fields:
                if len(_field_data) != 2:
                    # e.g. [u'location', u'_location_latitude',...]
                    continue
                (field_name, field) = _field_data
                name_lists.append(field.value_names)

            names = [name for name_list in name_lists for name in name_list]

            # add auto fields:
            names.extend(auto_fields[section])

            section_fields[section] = names

        # Flatten all the labels for all the headers of all the fields
        for section, labels in list(section_labels.items()):
            labels = [label for label_group in labels for label in label_group]

            # add auto fields (names and labels are the same)
            labels.extend(auto_fields[section])

            section_labels[section] = labels

        return section_fields, section_labels
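
Stripped of labels and sections, the positioning logic above reduces to: keep every field already known, and insert each new field just before the first already-known field that follows it in the newer version, falling back to the front (first position) or the end (no anchor found). A standalone sketch on bare name lists, assuming that simplification:

def merge_field_names(base, new):
    """Merge names from `new` into `base`, keeping a coherent order."""
    known = set(base)
    for i, name in enumerate(new):
        if name in known:
            continue
        if i == 0:  # opens the new version: put it at the front
            base.insert(0, name)
        else:
            for follower in new[i + 1:]:
                if follower in known:  # insert right before that anchor
                    base.insert(base.index(follower), name)
                    break
            else:  # no known field follows: append at the end
                base.append(name)
        known.add(name)
    return base

assert merge_field_names(['a', 'c'], ['a', 'b', 'c']) == ['a', 'b', 'c']
assert merge_field_names(['a', 'b'], ['z', 'a', 'y']) == ['z', 'a', 'b', 'y']
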