def parse_words_reference(tokens, i, parent):
    """Parse a reference to specific words, numbers or references in a text.

    Matches "le mot"/"les mots"/"des mots", "le nombre"/"le chiffre" or
    "la référence", each followed by quoted content. Creates a
    'words-reference' node under *parent*; on failure the node is removed
    and the entry index is returned so the caller can backtrack.

    :param tokens: list of lexer tokens
    :param i: index of the current token
    :param parent: AST node the new node is attached to
    :returns: index of the first unconsumed token (or the entry index on failure)
    """
    if i >= len(tokens):
        return i
    node = create_node(parent, {'type': 'words-reference'})
    debug(parent, tokens, i, 'parse_words_reference')
    # Remember the entry index so we can backtrack on failure.
    j = i
    i = alinea_lexer.skip_to_next_word(tokens, i)
    i = parse_position(tokens, i, node)
    # Fixed: every branch below reads tokens[i + 2]; guard against truncated
    # input instead of raising IndexError.
    if i + 2 >= len(tokens):
        debug(parent, tokens, i, 'parse_words_reference none')
        remove_node(parent, node)
        return j
    # le mot
    # les mots
    # des mots
    if tokens[i].lower() in [u'le', u'les', u'des'] and tokens[i + 2].startswith(u'mot'):
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_for_each(parse_quote, tokens, i, node)
    # le nombre
    # le chiffre
    elif tokens[i].lower() in [u'le'] and tokens[i + 2] in [u'nombre', u'chiffre']:
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_quote(tokens, i, node)
    # la référence
    elif tokens[i].lower() in [u'la'] and tokens[i + 2] == u'référence':
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_quote(tokens, i, node)
    else:
        debug(parent, tokens, i, 'parse_words_reference none')
        remove_node(parent, node)
        return j
    debug(parent, tokens, i, 'parse_words_reference end')
    return i
def parse_header2_definition(tokens, i, parent):
    """Parse the definition of a numbered sub-item (header2), e.g. "un 3°".

    Handles three patterns: an unnumbered placeholder ("un ... °"), a single
    numbered item ("un {order}°" with optional letter sub-order,
    multiplicative adverb and article-part reference), and a numeric range
    ("des {start}° à {end}°") followed by quoted contents.

    :param tokens: list of lexer tokens
    :param i: index of the current token
    :param parent: AST node new 'header2' nodes are attached to
    :returns: index of the first unconsumed token (or *i* if nothing matched)
    """
    if i >= len(tokens):
        return i
    debug(parent, tokens, i, 'parse_header2_definition')
    # un ... ° ({articlePartRef})
    if tokens[i].lower() == u'un' and ''.join(
            tokens[i + 2:i + 5]) == u'...' and tokens[i + 6] == u'°':
        node = create_node(parent, {'type': 'header2', 'children': []})
        # FIXME: should we simply ignore the 'order' field all together?
        node['order'] = '...'
        i += 8
        i = alinea_lexer.skip_spaces(tokens, i)
        if tokens[i] == u'ainsi' and tokens[i + 2] == u'rédigé':
            i = alinea_lexer.skip_to_quote_start(tokens, i + 4)
            i = parse_quote(tokens, i, node)
    # un {order}° ({orderLetter}) ({multiplicativeAdverb}) ({articlePartRef})
    elif tokens[i].lower() == u'un' and re.compile(u'\d+°').match(
            tokens[i + 2]):
        node = create_node(parent, {'type': 'header2', 'children': []})
        node['order'] = parse_int(tokens[i + 2])
        i += 4
        # Optional single-letter sub-order (e.g. "3° A").
        if re.compile(u'[A-Z]').match(tokens[i]):
            node['subOrder'] = tokens[i]
            i += 2
        i = parse_multiplicative_adverb(tokens, i, node)
        i = parse_article_part_reference(tokens, i, node)
        i = alinea_lexer.skip_spaces(tokens, i)
        if i < len(tokens) and tokens[i] == u'ainsi' and tokens[
                i + 2] == u'rédigé':
            i = alinea_lexer.skip_to_quote_start(tokens, i + 4)
            i = parse_quote(tokens, i, node)
    # des {start}° à {end}°
    elif (tokens[i].lower() == u'des'
          and re.compile(u'\d+°').match(tokens[i + 2])
          and tokens[i + 4] == u'à'
          and re.compile(u'\d+°').match(tokens[i + 6])):
        # NOTE(review): start/end are parsed but never used below — presumably
        # the number of quoted blocks should be checked against
        # (end - start + 1); confirm intended semantics.
        start = parse_int(tokens[i + 2])
        end = parse_int(tokens[i + 6])
        i += 8
        # ainsi rédigés
        if (i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé')
                or (i + 4 < len(tokens)
                    and tokens[i + 4].startswith(u'rédigé'))):
            i = alinea_lexer.skip_to_quote_start(tokens, i + 4)
            # One fresh 'header2' node is created per quoted block.
            i = parse_for_each(
                parse_quote, tokens, i, lambda: create_node(parent, {
                    'type': 'header2',
                    'children': []
                }))
    else:
        debug(parent, tokens, i, 'parse_header2_definition end')
        return i
    return i
def parse_title_definition(tokens, i, parent):
    """Parse the definition of a new title, e.g. "un titre III bis ainsi rédigé".

    Creates a 'title' node under *parent* carrying its roman-numeral 'order'
    and, when the phrase "ainsi rédigé" follows, its quoted contents.

    :param tokens: list of lexer tokens
    :param i: index of the current token
    :param parent: AST node the new node is attached to
    :returns: index of the first unconsumed token (or *i* unchanged on failure)
    """
    if i >= len(tokens):
        return i
    node = create_node(parent, {
        'type': 'title',
        'children': [],
    })
    debug(parent, tokens, i, 'parse_title_definition')
    # un titre {order}
    # Fixed: bounds-check before reading tokens[i + 2]/tokens[i + 4] to avoid
    # IndexError on truncated input (consistent with sibling parsers).
    if (i + 4 < len(tokens) and tokens[i].lower() == u'un'
            and tokens[i + 2] == u'titre' and is_roman_number(tokens[i + 4])):
        node['order'] = parse_roman_number(tokens[i + 4])
        i += 6
        i = parse_multiplicative_adverb(tokens, i, node)
    else:
        debug(parent, tokens, i, 'parse_title_definition none')
        remove_node(parent, node)
        return i
    i = alinea_lexer.skip_spaces(tokens, i)
    # ainsi rédigé (the title's contents follow as quotes)
    if i + 2 < len(tokens) and tokens[i] == u'ainsi' and tokens[i + 2] == u'rédigé':
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_for_each(parse_quote, tokens, i, node)
    debug(parent, tokens, i, 'parse_title_definition end')
    return i
def parse_alinea_definition(tokens, i, parent):
    """Parse the definition of one or more alineas, e.g. "deux alinéas ainsi rédigés".

    When quoted contents follow, one 'alinea' node is created per quote;
    otherwise a single 'alinea' node carrying the expected 'count' is created.

    :param tokens: list of lexer tokens
    :param i: index of the current token
    :param parent: AST node new 'alinea' nodes are attached to
    :returns: index of the first unconsumed token (or *i* if nothing matched)
    """
    if i >= len(tokens):
        return i
    debug(parent, tokens, i, 'parse_alinea_definition')
    # {count} alinéa(s)
    # Fixed: bounds-check before reading tokens[i + 2] to avoid IndexError
    # on truncated input.
    if i + 2 < len(tokens) and is_number_word(tokens[i]) and tokens[i + 2].startswith(u'alinéa'):
        count = word_to_number(tokens[i])
        i += 4
        # ainsi rédigé
        # est rédigé
        # est ainsi rédigé
        if (i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé')
                or (i + 4 < len(tokens) and tokens[i + 4].startswith(u'rédigé'))):
            # we expect {count} definitions => {count} quotes
            # but they don't always match, so for now we parse all of the available contents
            # FIXME: issue a warning because the expected count doesn't match?
            i = alinea_lexer.skip_spaces(tokens, i)
            i = alinea_lexer.skip_to_quote_start(tokens, i)
            i = parse_for_each(
                parse_quote, tokens, i, lambda: create_node(parent, {
                    'type': 'alinea',
                    'children': []
                }))
        else:
            # No quoted contents: record only how many alineas are expected.
            # (create_node attaches the node to parent; the unused local
            # binding has been dropped.)
            create_node(parent, {'type': 'alinea', 'count': count})
    else:
        debug(parent, tokens, i, 'parse_alinea_definition none')
        return i
    debug(parent, tokens, i, 'parse_alinea_definition end')
    return i
def parse_article_definition(tokens, i, parent):
    """Parse the definition of a new article ("un article ..." / "l'article ...").

    Creates an 'article' node under *parent*, parses the article identifier
    and, when the phrase "ainsi rédigé" follows, its quoted contents.

    :param tokens: list of lexer tokens
    :param i: index of the current token
    :param parent: AST node the new node is attached to
    :returns: index of the first unconsumed token (or *i* unchanged on failure)
    """
    if i >= len(tokens):
        return i
    node = create_node(parent, {
        'type': 'article',
        'children': [],
    })
    debug(parent, tokens, i, 'parse_article_definition')
    # un article / l'article — both introducers consume the same 4 tokens.
    if not (tokens[i] in (u'un', u'l') and tokens[i + 2] == u'article'):
        debug(parent, tokens, i, 'parse_article_definition none')
        remove_node(parent, node)
        return i
    i += 4
    i = parse_article_id(tokens, i, node)
    i = alinea_lexer.skip_spaces(tokens, i)
    # ainsi rédigé (the article's contents follow as quotes)
    if i < len(tokens) and tokens[i] == u'ainsi' and tokens[i + 2] == u'rédigé':
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_for_each(parse_quote, tokens, i, node)
    debug(parent, tokens, i, 'parse_article_definition end')
    return i
def parse_definition_list(tokens, i, parent):
    """Parse a list of definitions separated by ", ... à/au" or "et".

    Recursively parses each definition, then, when the list is followed by a
    "rédigé(e)(s)" phrase, pairs one quoted block with each definition node
    previously attached to *parent*.

    :param tokens: list of lexer tokens
    :param i: index of the current token
    :param parent: AST node the definitions are attached to
    :returns: index of the first unconsumed token
    """
    if i >= len(tokens):
        return i
    i = parse_definition(tokens, i, parent)
    i = alinea_lexer.skip_spaces(tokens, i)
    # List separators: ", ... à"/", ... au" (range) or "et" (conjunction);
    # recurse past the separator to parse the next definition.
    if ((i + 2 < len(tokens) and tokens[i] == u','
         and tokens[i + 2] in [u'à', u'au'])
            or (i + 2 < len(tokens) and tokens[i] == u'et')):
        i = parse_definition_list(tokens, i + 2, parent)
    i = alinea_lexer.skip_spaces(tokens, i)
    # est rédigé(es)
    # ainsi rédigé(es)
    # est ainsi rédigé(es)
    if (i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé')
            or (i + 4 < len(tokens) and tokens[i + 4].startswith(u'rédigé'))):
        i += 6
        # Collect the definition nodes attached so far and assign one quoted
        # block to each, in order.
        def_nodes = filter_nodes(
            parent, lambda x: 'type' in x and x['type'] in def_types)
        for def_node in def_nodes:
            i = alinea_lexer.skip_to_quote_start(tokens, i)
            i = parse_quote(tokens, i, def_node)
    return i
def parse_words_definition(tokens, i, parent):
    """Parse the definition of replacement words, numbers or references.

    Matches "le mot"/"les mots"/"des mots", "le nombre"/"le chiffre", a
    directly quoted span, or "la référence", each followed by quoted content.
    Creates a 'words' node under *parent*; on failure the node is removed and
    the entry index is returned so the caller can backtrack.

    :param tokens: list of lexer tokens
    :param i: index of the current token
    :param parent: AST node the new node is attached to
    :returns: index of the first unconsumed token (or the entry index on failure)
    """
    if i >= len(tokens):
        return i
    node = create_node(parent, {'type': 'words', 'children': []})
    debug(parent, tokens, i, 'parse_words_definition')
    # Remember the entry index so we can backtrack on failure.
    j = i
    i = parse_position(tokens, i, node)
    # le mot
    # les mots
    # des mots
    if tokens[i].lower() in [u'le', u'les', u'des'
                             ] and tokens[i + 2].startswith(u'mot'):
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_for_each(parse_quote, tokens, i, node)
        # i = alinea_lexer.skip_spaces(tokens, i)
    # le nombre
    # le chiffre
    elif tokens[i].lower() in [u'le'
                               ] and tokens[i + 2] in [u'nombre', u'chiffre']:
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_quote(tokens, i, node)
    # " (the words are quoted directly, with no introductory phrase)
    elif tokens[i] == alinea_lexer.TOKEN_DOUBLE_QUOTE_OPEN:
        i = parse_for_each(parse_quote, tokens, i, node)
        i = alinea_lexer.skip_spaces(tokens, i)
    # la référence
    elif tokens[i] == u'la' and tokens[i + 2] == u'référence':
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_quote(tokens, i, node)
    else:
        debug(parent, tokens, i, 'parse_words_definition none')
        remove_node(parent, node)
        return j
    debug(parent, tokens, i, 'parse_words_definition end')
    return i
def parse_header1_definition(tokens, i, parent):
    """Parse the definition of a roman-numbered major subdivision, e.g. "un II".

    Creates a 'header1' node under *parent* carrying its 'order' and, when the
    phrase "ainsi rédigé" follows, its quoted contents.

    :param tokens: list of lexer tokens
    :param i: index of the current token
    :param parent: AST node the new node is attached to
    :returns: index of the first unconsumed token (or *i* unchanged on failure)
    """
    if i >= len(tokens):
        return i
    node = create_node(parent, {'type': 'header1', 'children': []})
    debug(parent, tokens, i, 'parse_header1_definition')
    # un {romanPartNumber}
    # Fixed: bounds-check before reading tokens[i + 2] to avoid IndexError
    # on truncated input.
    if i + 2 < len(tokens) and tokens[i].lower() == u'un' and is_roman_number(tokens[i + 2]):
        node['order'] = parse_roman_number(tokens[i + 2])
        i += 4
        i = alinea_lexer.skip_spaces(tokens, i)
        if i + 2 < len(tokens) and tokens[i] == u'ainsi' and tokens[i + 2] == u'rédigé':
            i = alinea_lexer.skip_to_quote_start(tokens, i)
            i = parse_quote(tokens, i, node)
    else:
        # Fixed: the failure branch previously logged '... end', unlike every
        # sibling parser which logs '... none' when nothing matched.
        debug(parent, tokens, i, 'parse_header1_definition none')
        remove_node(parent, node)
        return i
    return i
def parse_mention_definition(tokens, i, parent):
    """Parse the definition of a mention, e.g. "la mention : « ... »".

    Creates a 'mention' node under *parent* and, when a colon follows, parses
    the quoted mention text into it.

    :param tokens: list of lexer tokens
    :param i: index of the current token
    :param parent: AST node the new node is attached to
    :returns: index of the first unconsumed token (or *i* unchanged on failure)
    """
    if i >= len(tokens):
        return i
    node = create_node(parent, {'type': 'mention', 'children': []})
    debug(parent, tokens, i, 'parse_mention_definition')
    # la mention — bail out (guard clause) when the introducer is absent.
    if not (tokens[i].lower() == u'la' and tokens[i + 2] == u'mention'):
        debug(parent, tokens, i, 'parse_mention_definition none')
        remove_node(parent, node)
        return i
    i += 4
    # : (the mention text follows as one or more quotes)
    if tokens[i] == ':':
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_for_each(parse_quote, tokens, i, node)
    debug(parent, tokens, i, 'parse_mention_definition end')
    return i