def _build_vflist(elements):
    """Walk ``elements`` and return a list of variant forms.

    A ``StateMachine`` tracks the context that governs each <vf>:

    * the 'outside' state is the main state, i.e. outside parens. It
      holds the last encountered date-range, grammar information (<gr>)
      and label (<la>);
    * the 'inside' state holds the same three values for the current
      parenthesised span;
    * 'parens' records whether we are currently 'inside' or 'outside'
      parens.

    Each <vf> element produces one ``VariantFormFromParser``. For each of
    date-range, grammar and label, the value for the current paren state
    is read first; if it is empty, the 'outside' value is used instead.

    A hard break (``hardBreak`` or ``p``) clears all state. A ``newStart``
    element clears all state and also discards every variant form
    collected so far.
    """
    variants = []
    tracker = StateMachine()

    for raw_element in elements:
        item = _adjust_element(raw_element, tracker)
        tag = item.tag

        if tag == 'vf':
            # Prefer values for the current paren state; fall back to the
            # 'outside' state whenever the current value is empty.
            start, end = tracker.read('date_range')
            if not start:
                start, end = tracker.read('date_range',
                                          paren_state='outside')

            grammar = tracker.read('grammar')
            if not grammar:
                grammar = tracker.read('grammar', paren_state='outside')

            label = tracker.read('label')
            if not label:
                label = tracker.read('label', paren_state='outside')

            form = VariantFormFromParser(item.node, start, end)
            form.set_grammatical_information(grammar)
            form.label = label
            variants.append(form)
            continue

        if tag == 'vd':
            tracker.set('date_range', utilities.find_range(item.text))
            # A label is kept only when it immediately precedes this date.
            if item.previous != 'la':
                tracker.set('label', '')
            continue

        if tag in ('hardBreak', 'p'):
            tracker.clear()
            continue

        if tag == 'softBreak':
            tracker.set('parens', 'outside')
            tracker.set('label', '')
            continue

        if tag == 'openParen' and item.next != 'vf':
            # Enter the parenthesised span, making sure that grammar,
            # label and date-range inside parens start out null.
            for key, value in (('parens', 'inside'),
                               ('grammar', ''),
                               ('label', ''),
                               ('date_range', (0, 0))):
                tracker.set(key, value)
            continue

        if tag == 'closeParen':
            tracker.set('parens', 'outside')
            continue

        if (tag in ('gr', 'text')
                and item.text
                and not IGNORABLE_GRAMMAR_PATTERN.search(item.text)):
            tracker.set('grammar', item.text)
            continue

        if tag == 'la':
            tracker.set('label', item.text)
            continue

        if tag == 'newStart':
            # Start over: drop everything gathered before this point.
            variants = []
            tracker.clear()

    return variants