def parse_article_id(tokens, i, node):
    """Parse an article identifier (e.g. "L. 123-4" optionally followed by a
    capital letter and a multiplicative adverb) into node['id'].

    Returns the index of the first token after the identifier. If nothing
    identifier-like was found, node['id'] is deleted from *node*.
    """
    node['id'] = ''

    # Compile once instead of on every loop iteration; raw string avoids the
    # invalid '\d' escape deprecation.
    num_re = re.compile(r'\d+(-\d+)?')

    # article {articleId} de {lawReference}
    # Accumulate the "L." prefix (and any following tokens) up to the numeric
    # part. Both lookups are bounds-checked: the original could raise
    # IndexError on tokens[i + 1] and inside the while loop at end of stream.
    if i + 1 < len(tokens) and tokens[i] == 'L' and tokens[i + 1] == '.':
        while i < len(tokens) and not num_re.match(tokens[i]):
            node['id'] += tokens[i]
            i += 1

    if i < len(tokens) and num_re.match(tokens[i]):
        node['id'] += tokens[i]
        # skip {articleId} and the following space
        i += 1
        i = alinea_lexer.skip_spaces(tokens, i)

    # {articleId} {articleLetter}
    # FIXME: handle the {articleLetter}{multiplicativeAdverb} case?
    if i < len(tokens) and re.match(r'^[A-Z]$', tokens[i]):
        node['id'] += ' ' + tokens[i]
        # skip {articleLetter} and the following space
        i += 1
        i = alinea_lexer.skip_spaces(tokens, i)

    i = parse_multiplicative_adverb(tokens, i, node)

    # No usable identifier: remove the empty/whitespace-only field.
    if not node['id'] or is_space(node['id']):
        del node['id']

    return i
def parse_definition_list(tokens, i, parent):
    """Parse a list of definitions separated by ", ... à/au" or "et",
    recursing for each list item, then attach the quoted contents
    ("rédigé(es)") to every definition node created under *parent*.

    Returns the index of the first unconsumed token.
    """
    if i >= len(tokens):
        return i

    i = parse_definition(tokens, i, parent)
    i = alinea_lexer.skip_spaces(tokens, i)
    # Separator between list items: ", ... à/au" (a range) or "et" (a
    # conjunction); recurse past the separator.
    if ((i + 2 < len(tokens) and tokens[i] == u',' and tokens[i + 2] in [u'à', u'au'])
            or (i + 2 < len(tokens) and tokens[i] == u'et')):
        i = parse_definition_list(tokens, i + 2, parent)
    i = alinea_lexer.skip_spaces(tokens, i)

    # est rédigé(es)
    # ainsi rédigé(es)
    # est ainsi rédigé(es)
    if (i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé')
            or (i + 4 < len(tokens) and tokens[i + 4].startswith(u'rédigé'))):
        i += 6
        # One quote is expected per definition node gathered so far.
        # NOTE(review): def_types is a module-level set/list defined elsewhere
        # in the file — confirm it matches the node types parse_definition emits.
        def_nodes = filter_nodes(
            parent, lambda x: 'type' in x and x['type'] in def_types)
        for def_node in def_nodes:
            i = alinea_lexer.skip_to_quote_start(tokens, i)
            i = parse_quote(tokens, i, def_node)

    return i
def parse_header2_definition(tokens, i, parent):
    """Parse a header-2 definition ("un {order}°", "un ... °", or a
    "des {start}° à {end}°" range), creating 'header2' nodes under *parent*
    and attaching any quoted contents.

    Returns the index of the first unconsumed token.
    """
    if i >= len(tokens):
        return i

    debug(parent, tokens, i, 'parse_header2_definition')

    # un ... ° ({articlePartRef})
    if tokens[i].lower() == u'un' and ''.join(
            tokens[i + 2:i + 5]) == u'...' and tokens[i + 6] == u'°':
        node = create_node(parent, {'type': 'header2', 'children': []})
        # FIXME: should we simply ignore the 'order' field all together?
        node['order'] = '...'
        i += 8
        i = alinea_lexer.skip_spaces(tokens, i)
        # NOTE(review): these lookaheads are not bounds-checked — may raise
        # IndexError on a truncated stream; confirm with callers.
        if tokens[i] == u'ainsi' and tokens[i + 2] == u'rédigé':
            i = alinea_lexer.skip_to_quote_start(tokens, i + 4)
            i = parse_quote(tokens, i, node)
    # un {order}° ({orderLetter}) ({multiplicativeAdverb}) ({articlePartRef})
    elif tokens[i].lower() == u'un' and re.compile(u'\d+°').match(
            tokens[i + 2]):
        node = create_node(parent, {'type': 'header2', 'children': []})
        node['order'] = parse_int(tokens[i + 2])
        i += 4
        # Optional single-letter sub-order (e.g. "1° A").
        if re.compile(u'[A-Z]').match(tokens[i]):
            node['subOrder'] = tokens[i]
            i += 2
        i = parse_multiplicative_adverb(tokens, i, node)
        i = parse_article_part_reference(tokens, i, node)
        i = alinea_lexer.skip_spaces(tokens, i)
        if i < len(tokens) and tokens[i] == u'ainsi' and tokens[
                i + 2] == u'rédigé':
            i = alinea_lexer.skip_to_quote_start(tokens, i + 4)
            i = parse_quote(tokens, i, node)
    # des {start}° à {end}°
    elif (tokens[i].lower() == u'des'
          and re.compile(u'\d+°').match(tokens[i + 2])
          and tokens[i + 4] == u'à'
          and re.compile(u'\d+°').match(tokens[i + 6])):
        # NOTE(review): start/end are parsed but never used below — the range
        # bounds are currently discarded; each quote just creates a node.
        start = parse_int(tokens[i + 2])
        end = parse_int(tokens[i + 6])
        i += 8
        # ainsi rédigés
        if (i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé')
                or (i + 4 < len(tokens) and tokens[i + 4].startswith(u'rédigé'))):
            i = alinea_lexer.skip_to_quote_start(tokens, i + 4)
            # One fresh 'header2' node per parsed quote.
            i = parse_for_each(
                parse_quote, tokens, i,
                lambda: create_node(parent, {
                    'type': 'header2',
                    'children': []
                }))
    else:
        debug(parent, tokens, i, 'parse_header2_definition end')
        return i

    return i
def parse_reference_list(tokens, i, parent):
    """Parse one reference, then recurse over ", ... à/au" ranges and "et"
    conjunctions so that every reference in the list lands under *parent*.

    Returns the index of the first unconsumed token.
    """
    if i >= len(tokens):
        return i

    cursor = parse_reference(tokens, i, parent)
    cursor = alinea_lexer.skip_spaces(tokens, cursor)

    # A list continues either with ", ... à/au ..." (a range) or "et ..."
    # (a conjunction); in both cases the separator spans two tokens.
    if cursor + 2 < len(tokens):
        range_sep = tokens[cursor] == u',' and tokens[cursor + 2] in [u'à', u'au']
        conjunction = tokens[cursor] == u'et'
        if range_sep or conjunction:
            cursor = parse_reference_list(tokens, cursor + 2, parent)

    return alinea_lexer.skip_spaces(tokens, cursor)
def parse_alinea_definition(tokens, i, parent):
    """Parse an alinea definition ("{count} alinéa(s)"), creating 'alinea'
    nodes under *parent* — one per quote when the quoted contents follow,
    or a single node carrying the announced count otherwise.

    Returns the index of the first unconsumed token.
    """
    if i >= len(tokens):
        return i

    debug(parent, tokens, i, 'parse_alinea_definition')

    # {count} alinéa(s)
    # NOTE(review): tokens[i + 2] is not bounds-checked — may raise
    # IndexError on a truncated stream.
    if is_number_word(tokens[i]) and tokens[i + 2].startswith(u'alinéa'):
        count = word_to_number(tokens[i])
        i += 4
        # ainsi rédigé
        # est rédigé
        # est ainsi rédigé
        if (i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé')
                or (i + 4 < len(tokens) and tokens[i + 4].startswith(u'rédigé'))):
            # we expect {count} definitions => {count} quotes
            # but they don't always match, so for now we parse all of the available contents
            # FIXME: issue a warning because the expected count doesn't match?
            i = alinea_lexer.skip_spaces(tokens, i)
            i = alinea_lexer.skip_to_quote_start(tokens, i)
            i = parse_for_each(
                parse_quote, tokens, i,
                lambda: create_node(parent, {
                    'type': 'alinea',
                    'children': []
                }))
        else:
            # No quoted contents: record only the announced count.
            node = create_node(parent, {'type': 'alinea', 'count': count})
    else:
        debug(parent, tokens, i, 'parse_alinea_definition none')
        return i

    debug(parent, tokens, i, 'parse_alinea_definition end')

    return i
def parse_article_definition(tokens, i, parent):
    """Parse an article definition ("un article ..." or "l'article ..."),
    creating an 'article' node under *parent* with its id and any quoted
    contents.

    Returns the index of the first unconsumed token; the node is removed
    again when no article definition starts at *i*.
    """
    if i >= len(tokens):
        return i

    node = create_node(parent, {
        'type': 'article',
        'children': [],
    })
    debug(parent, tokens, i, 'parse_article_definition')

    # un article
    if tokens[i] == u'un' and tokens[i + 2] == u'article':
        i += 4
    # l'article
    elif tokens[i] == u'l' and tokens[i + 2] == u'article':
        i += 4
    else:
        debug(parent, tokens, i, 'parse_article_definition none')
        remove_node(parent, node)
        return i

    i = parse_article_id(tokens, i, node)

    i = alinea_lexer.skip_spaces(tokens, i)
    # "ainsi rédigé": the article contents follow as one or more quotes.
    if i < len(tokens) and tokens[i] == u'ainsi' and tokens[i + 2] == u'rédigé':
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_for_each(parse_quote, tokens, i, node)

    debug(parent, tokens, i, 'parse_article_definition end')

    return i
def parse_title_definition(tokens, i, parent):
    """Parse a title definition ("un titre {romanNumber}"), creating a
    'title' node under *parent* with its order and any quoted contents.

    Returns the index of the first unconsumed token; the node is removed
    again when no title definition starts at *i*.
    """
    if i >= len(tokens):
        return i

    node = create_node(parent, {
        'type': 'title',
        'children': [],
    })

    debug(parent, tokens, i, 'parse_title_definition')

    # un titre {order}
    # Bounds-check the lookahead: the original read tokens[i + 2] and
    # tokens[i + 4] unguarded and could raise IndexError near the end of the
    # stream (sibling parsers such as parse_article_definition guard this).
    if (i + 4 < len(tokens) and tokens[i].lower() == u'un'
            and tokens[i + 2] == u'titre' and is_roman_number(tokens[i + 4])):
        node['order'] = parse_roman_number(tokens[i + 4])
        i += 6
        i = parse_multiplicative_adverb(tokens, i, node)
    else:
        debug(parent, tokens, i, 'parse_title_definition none')
        remove_node(parent, node)
        return i

    i = alinea_lexer.skip_spaces(tokens, i)
    # "ainsi rédigé": the title contents follow as one or more quotes.
    # Same bounds guard as above for tokens[i] / tokens[i + 2].
    if i + 2 < len(tokens) and tokens[i] == u'ainsi' and tokens[i + 2] == u'rédigé':
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_for_each(parse_quote, tokens, i, node)

    debug(parent, tokens, i, 'parse_title_definition end')

    return i
def parse_bill_header3(tokens, i, parent):
    """Parse a level-3 bill header ("a)", "b) (nouveau)" ...), creating a
    'bill-header3' node under *parent* and parsing the edit it introduces.

    When no letter header is present the edit is attached directly to
    *parent*. Returns the index of the first unconsumed token.
    """
    if i >= len(tokens):
        return i

    node = create_node(parent, {
        'type': 'bill-header3',
        'children': [],
    })

    debug(parent, tokens, i, 'parse_bill_header3')

    i = alinea_lexer.skip_spaces(tokens, i)

    # NOTE(review): tokens[i], tokens[i + 1], tokens[i + 2] and tokens[i + 5]
    # are not bounds-checked — may raise IndexError on a truncated stream.
    match = re.compile('([a-z]+)').match(tokens[i])
    if match and (tokens[i + 1] == u')'
                  or (tokens[i + 2] == u'(' and tokens[i + 5] == u')')):
        # Letter order: 'a' -> 1, 'b' -> 2, ...
        node['order'] = ord(match.group()[0].encode('utf-8')) - ord('a') + 1
        # skip '{letter}) ' or '{letter} (nouveau))'
        if tokens[i + 1] == u')':
            i += 3
        else:
            i += 7
        # i = parse_edit(tokens, i, node)
    else:
        # No letter header: parse the edit directly into the parent.
        remove_node(parent, node)
        node = parent

    i = parse_edit(tokens, i, node)

    # Drop the header node if the edit parse produced nothing under it.
    if node != parent and len(node['children']) == 0:
        remove_node(parent, node)

    debug(parent, tokens, i, 'parse_bill_header3 end')

    return i
def parse_bill_header2(tokens, i, parent):
    """Parse a level-2 bill header ("{number}° ..."), creating a
    'bill-header2' node under *parent*, then parsing its edit and any
    level-3 sub-headers.

    When no "{number}°" header is present, contents are attached directly
    to *parent*. Returns the index of the first unconsumed token.
    """
    if i >= len(tokens):
        return i

    node = create_node(parent, {
        'type': 'bill-header2',
        'order': 0,
        'children': [],
    })

    debug(parent, tokens, i, 'parse_bill_header2')

    i = alinea_lexer.skip_spaces(tokens, i)
    # NOTE(review): tokens[i] is not bounds-checked after skip_spaces — may
    # raise IndexError at end of stream.
    if re.compile(u'\d+°').match(tokens[i]):
        debug(parent, tokens, i, 'parse_bill_header2 found article header-2')
        node['order'] = parse_int(tokens[i])
        # skip {number}°
        i = alinea_lexer.skip_to_next_word(tokens, i + 2)
    else:
        # No numbered header: parse directly into the parent.
        remove_node(parent, node)
        node = parent

    i = parse_edit(tokens, i, node)
    i = parse_for_each(parse_bill_header3, tokens, i, node)

    # Drop the header node if nothing was parsed under it.
    if node != parent and len(node['children']) == 0:
        remove_node(parent, node)

    debug(parent, tokens, i, 'parse_bill_header2 end')

    return i
def parse_bill_header1(tokens, i, parent):
    """Parse a level-1 bill header ("{romanNumber}. ..."), creating a
    'bill-header1' node under *parent*, then parsing its edit, level-2
    sub-headers, or raw article content.

    Returns the index of the first unconsumed token.
    """
    if i >= len(tokens):
        return i

    i = alinea_lexer.skip_spaces(tokens, i)

    node = create_node(parent, {
        'type': 'bill-header1',
        'order': 0,
        'children': [],
    })

    debug(parent, tokens, i, 'parse_bill_header1')

    # skip '{romanNumber}.'
    # Bounds-check the lookahead: the original read tokens[i] / tokens[i + 1]
    # unguarded after skip_spaces and could raise IndexError at end of stream.
    if i + 1 < len(tokens) and is_roman_number(tokens[i]) and tokens[i + 1] == u'.':
        debug(parent, tokens, i, 'parse_bill_header1 found article header-1')
        node['order'] = parse_roman_number(tokens[i])
        i = alinea_lexer.skip_to_next_word(tokens, i + 2)

    j = i
    i = parse_edit(tokens, i, node)
    i = parse_for_each(parse_bill_header2, tokens, i, node)
    # Nothing consumed by the structured parsers: fall back to raw content.
    if i == j:
        i = parse_raw_article_content(tokens, i, node)

    if len(node['children']) == 0:
        remove_node(parent, node)
    else:
        # Renumber from the sibling count. The original used
        # len(filter(...)), which raises TypeError on Python 3 because
        # filter() returns an iterator; count explicitly instead.
        node['order'] = sum(
            1 for x in parent['children'] if x['type'] == node['type'])

    debug(parent, tokens, i, 'parse_bill_header1 end')

    return i
def parse_law_reference(tokens, i, parent):
    """Parse a reference to a law or ordinance ("la loi n° ... du ...",
    "l'ordonnance n° ..."), creating a 'law-reference' node under *parent*
    with its lawId, optional lawType ('ordonnance'/'organic') and optional
    lawDate (ISO-like "YYYY-M-DD" string).

    Returns the index of the first unconsumed token, or the original index
    when the reference is absent or incomplete (node removed in both cases).
    """
    if i >= len(tokens):
        return i

    # Remember the starting index so an incomplete reference can be undone.
    j = i

    node = create_node(parent, {
        'type': 'law-reference',
        'lawId': '',
        'children': [],
    })

    debug(parent, tokens, i, 'parse_law_reference')

    # de l'ordonnance
    # l'ordonnance
    if i + 4 < len(tokens) and (tokens[i + 2] == u'ordonnance'
                                or tokens[i + 4] == u'ordonnance'):
        node['lawType'] = 'ordonnance'
        i = alinea_lexer.skip_to_token(tokens, i, u'ordonnance') + 2
    # de la loi
    # la loi
    elif i + 4 < len(tokens) and (
            (tokens[i] == u'la' and tokens[i + 2] == u'loi')
            or (tokens[i] == u'de' and tokens[i + 4] == u'loi')):
        i = alinea_lexer.skip_to_token(tokens, i, u'loi') + 2
    else:
        remove_node(parent, node)
        return i

    # "loi organique" marks an organic law.
    if tokens[i] == u'organique':
        node['lawType'] = 'organic'
        i += 2

    i = alinea_lexer.skip_to_token(tokens, i, u'n°') + 1
    # If we didn't find the "n°" token, the reference is incomplete and we forget about it.
    # FIXME: we might have to handle the "la même ordonnance" or "la même loi" incomplete reference cases.
    if i >= len(tokens):
        remove_node(parent, node)
        return j

    i = alinea_lexer.skip_spaces(tokens, i)
    node['lawId'] = tokens[i]
    # skip {lawId} and the following space
    i += 2

    # "du {day} {month} {year}": tokens are word/space interleaved, so the
    # day is at i + 2, the month name at i + 4 and the year at i + 6.
    if i < len(tokens) and tokens[i] == u'du':
        node['lawDate'] = tokens[i + 6] + u'-' + str(
            month_to_number(tokens[i + 4])) + u'-' + tokens[i + 2]
        # skip {lawDate} and the following space
        i += 7

    # i = alinea_lexer.skip_spaces(tokens, i)
    # if tokens[i] == u'relative':
    #     print('foo')

    debug(parent, tokens, i, 'parse_law_reference end')

    return i
def parse_multiplicative_adverb(tokens, i, node):
    """Parse a Latin multiplicative adverb suffix ("bis", "ter", ...) on the
    current token, setting node['is{Adverb}'] = True for the matched adverb.

    Returns the index of the first token after the adverb (and the following
    space), or *i* unchanged when no adverb matches.
    """
    if i >= len(tokens):
        return i

    # Try the longest adverbs first so a longer suffix is not shadowed by a
    # shorter adverb it happens to end with. The original called
    # TOKEN_MULTIPLICATIVE_ADVERBS.sort(...) — which returns None (the
    # assignment was dead) and mutated the shared module-level list on every
    # call; sort a local copy instead.
    for adverb in sorted(
            alinea_lexer.TOKEN_MULTIPLICATIVE_ADVERBS,
            key=lambda s: -len(s)):
        if tokens[i].endswith(adverb):
            node['is' + adverb.title()] = True
            # skip {multiplicativeAdverb} and the following space
            i += 1
            i = alinea_lexer.skip_spaces(tokens, i)
            return i

    return i
def parse_quote(tokens, i, parent):
    """Parse a double-quoted span of text, creating a 'quote' node under
    *parent* whose 'words' field is the concatenation of the quoted tokens.

    Returns the index of the first token after the closing quote (and any
    following spaces); the node is removed when no opening quote is found.
    """
    if i >= len(tokens):
        return i

    node = create_node(parent, {'type': 'quote', 'words': ''})

    debug(parent, tokens, i, 'parse_quote')

    i = alinea_lexer.skip_spaces(tokens, i)

    # "
    if tokens[i] == alinea_lexer.TOKEN_DOUBLE_QUOTE_OPEN:
        i += 1
    # # est rédigé(es)
    # # ainsi rédigé(es)
    # # est ainsi rédigé(es)
    # elif (i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé')
    #       or (i + 4 < len(tokens) and tokens[i + 4].startswith(u'rédigé'))):
    #     i = alinea_lexer.skip_to_quote_start(tokens, i + 2) + 1
    else:
        remove_node(parent, node)
        return i

    # Accumulate everything up to the closing quote or the end of the line.
    while (i < len(tokens)
           and tokens[i] != alinea_lexer.TOKEN_DOUBLE_QUOTE_CLOSE
           and tokens[i] != alinea_lexer.TOKEN_NEW_LINE):
        node['words'] += tokens[i]
        i += 1
    # skip alinea_lexer.TOKEN_DOUBLE_QUOTE_CLOSE
    # NOTE(review): this also skips the token when the loop stopped on a
    # newline or end of stream — confirm that is intended.
    i += 1
    i = alinea_lexer.skip_spaces(tokens, i)

    debug(parent, tokens, i, 'parse_quote end')

    return i
def parse_header1_definition(tokens, i, parent):
    """Parse a header-1 definition ("un {romanPartNumber}"), creating a
    'header1' node under *parent* with its order and any quoted contents.

    Returns the index of the first unconsumed token; the node is removed
    again when no header-1 definition starts at *i*.
    """
    if i >= len(tokens):
        return i

    node = create_node(parent, {'type': 'header1', 'children': []})
    debug(parent, tokens, i, 'parse_header1_definition')

    # un {romanPartNumber}
    # Guard clause: anything else is not a header-1 definition.
    found = tokens[i].lower() == u'un' and is_roman_number(tokens[i + 2])
    if not found:
        debug(parent, tokens, i, 'parse_header1_definition end')
        remove_node(parent, node)
        return i

    node['order'] = parse_roman_number(tokens[i + 2])
    i = alinea_lexer.skip_spaces(tokens, i + 4)

    # Optional "ainsi rédigé" followed by the quoted contents.
    if i + 2 < len(tokens) and tokens[i] == u'ainsi' and tokens[i + 2] == u'rédigé':
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_quote(tokens, i, node)

    return i
def parse_words_definition(tokens, i, parent):
    """Parse a words definition ("les mots ...", "le nombre ...", "la
    référence ...", or a bare quote), creating a 'words' node under *parent*
    holding the quoted content.

    Returns the index of the first unconsumed token, or the original index
    (with the node removed) when nothing matches.
    """
    if i >= len(tokens):
        return i

    node = create_node(parent, {'type': 'words', 'children': []})
    debug(parent, tokens, i, 'parse_words_definition')

    # Remember the starting index so a failed parse can be undone.
    j = i
    i = parse_position(tokens, i, node)
    # le mot
    # les mots
    # des mots
    if tokens[i].lower() in [u'le', u'les', u'des'
                             ] and tokens[i + 2].startswith(u'mot'):
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_for_each(parse_quote, tokens, i, node)
        # i = alinea_lexer.skip_spaces(tokens, i)
    # le nombre
    # le chiffre
    elif tokens[i].lower() in [u'le'
                               ] and tokens[i + 2] in [u'nombre', u'chiffre']:
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_quote(tokens, i, node)
    # "
    elif tokens[i] == alinea_lexer.TOKEN_DOUBLE_QUOTE_OPEN:
        i = parse_for_each(parse_quote, tokens, i, node)
        i = alinea_lexer.skip_spaces(tokens, i)
    # la référence
    elif tokens[i] == u'la' and tokens[i + 2] == u'référence':
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_quote(tokens, i, node)
    else:
        debug(parent, tokens, i, 'parse_words_definition none')
        remove_node(parent, node)
        return j

    debug(parent, tokens, i, 'parse_words_definition end')

    return i
def parse_article_reference(tokens, i, parent):
    """Parse a reference to an article ("l'article ...", "à l'article ...",
    "Article {n}", or the back-reference "le même article"), creating an
    'article-reference' node under *parent* with its id and any nested
    law/code/words/alinea reference.

    Returns the index of the first unconsumed token, or the original index
    (with the node removed) when nothing matches.
    """
    if i >= len(tokens):
        return i

    node = create_node(parent, {'type': 'article-reference', 'id': ''})

    debug(parent, tokens, i, 'parse_article_reference')

    # Remember the starting index so a failed parse can be undone.
    j = i
    i = parse_position(tokens, i, node)
    # de l'article
    # à l'article
    if tokens[i].lower() in [
            u'de', u'à'
    ] and tokens[i + 2] == u'l' and tokens[i + 4] == u'article':
        i += 5
        i = alinea_lexer.skip_spaces(tokens, i)
    # l'article
    elif tokens[i].lower() == u'l' and tokens[
            i + 1] == alinea_lexer.TOKEN_SINGLE_QUOTE and tokens[
                i + 2] == u'article':
        i += 3
        i = alinea_lexer.skip_spaces(tokens, i)
    # elif tokens[i] == u'un' and tokens[i + 2] == u'article':
    #     i += 4
    # Article {articleNumber}
    elif tokens[i].lower().startswith(u'article'):
        i += 1
        i = alinea_lexer.skip_spaces(tokens, i)
    # le même article
    elif tokens[i].lower() == u'le' and tokens[i + 2] == u'même' and tokens[
            i + 4] == u'article':
        i += 6
        # Resolve the back-reference by copying the previously parsed
        # article-reference node.
        article_refs = filter_nodes(
            get_root(parent),
            lambda n: 'type' in n and n['type'] == 'article-reference')
        # the last one in order of traversal is the previous one in order of syntax
        # don't forget the current node is in the list too => -2 instead of -1
        article_ref = copy_node(article_refs[-2])
        push_node(parent, article_ref)
        remove_node(parent, node)
    else:
        remove_node(parent, node)
        return j

    i = parse_article_id(tokens, i, node)

    # i = parse_article_part_reference(tokens, i, node)
    # de la loi
    # de l'ordonnance
    # du code
    # les mots
    # l'alinéa
    i = parse_one_of([
        parse_law_reference, parse_code_reference, parse_words_reference,
        parse_alinea_reference
    ], tokens, i, node)

    # i = parse_quote(tokens, i, node)

    debug(parent, tokens, i, 'parse_article_reference end')

    return i
def parse_edit(tokens, i, parent):
    """Parse one edit sentence (delete / edit / replace / add / rename),
    creating an 'edit' node under *parent* whose 'editType' records the kind
    of modification and whose children hold the parsed references and
    definitions.

    Returns the index of the first token of the next line; when no edit can
    be recognized the node is removed and the content is parsed as raw
    article content instead.
    """
    if i >= len(tokens):
        return i

    node = create_node(parent, {'type': 'edit'})

    debug(parent, tokens, i, 'parse_edit')

    # Remember the starting index so an unrecognized edit can be re-parsed
    # as raw content.
    r = i
    # i = parse_for_each(parse_reference, tokens, i, node)
    i = parse_reference_list(tokens, i, node)
    # if we did not parse a reference

    i = alinea_lexer.skip_spaces(tokens, i)

    # if we didn't find any reference as a subject and the subject/verb are not reversed
    if len(node['children']) == 0 and tokens[i] != 'Est' and tokens[i] != 'Sont':
        remove_node(parent, node)
        debug(parent, tokens, i, 'parse_edit none')
        return i
    # i = r

    # Advance to the verb ("est", "sont", "devient") or a full stop.
    i = alinea_lexer.skip_tokens(
        tokens, i,
        lambda t: t.lower() not in [u'est', u'sont', u'devient'] and not t == u'.')
    if i + 2 >= len(tokens):
        remove_node(parent, node)
        debug(parent, tokens, i, 'parse_edit eof')
        return r

    # sont supprimés
    # sont supprimées
    # est supprimé
    # est supprimée
    # est abrogé
    # est abrogée
    # sont abrogés
    # sont abrogées
    if tokens[i + 2].startswith(u'supprimé') or tokens[i + 2].startswith(u'abrogé'):
        node['editType'] = 'delete'
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
    # est ainsi rédigé
    # est ainsi rédigée
    # est ainsi modifié
    # est ainsi modifiée
    # NOTE(review): tokens[i + 4] is only guaranteed in bounds when
    # i + 2 < len(tokens) - 2 — this elif may raise IndexError on a short
    # stream; confirm with callers.
    elif tokens[i + 4].startswith(u'rédigé') or tokens[i + 4].startswith(u'modifié'):
        node['editType'] = 'edit'
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
        i = alinea_lexer.skip_spaces(tokens, i)
        i = parse_definition(tokens, i, node)
    # est remplacé par
    # est remplacée par
    # sont remplacés par
    # sont remplacées par
    elif tokens[i + 2].startswith(u'remplacé'):
        node['editType'] = 'replace'
        i += 6
        i = parse_definition(tokens, i, node)
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
    # remplacer
    elif tokens[i].lower() == u'remplacer':
        node['editType'] = 'replace'
        i += 2
        # i = parse_definition(tokens, i, node)
        i = parse_reference(tokens, i, node)
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
        if tokens[i].lower() == 'par':
            i += 2
            i = parse_definition(tokens, i, node)
            i = alinea_lexer.skip_to_end_of_line(tokens, i)
    # est inséré
    # est insérée
    # sont insérés
    # sont insérées
    # est ajouté
    # est ajoutée
    # sont ajoutés
    # sont ajoutées
    elif tokens[i + 2].startswith(u'inséré') or tokens[i + 2].startswith(u'ajouté'):
        node['editType'] = 'add'
        i += 4
        i = parse_definition(tokens, i, node)
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
    # est ainsi rétabli
    elif tokens[i + 4].startswith(u'rétabli'):
        node['editType'] = 'add'
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
        i = alinea_lexer.skip_spaces(tokens, i)
        i = parse_definition(tokens, i, node)
    # est complété par
    elif tokens[i + 2] == u'complété':
        node['editType'] = 'add'
        i += 6
        # i = parse_definition(tokens, i, node)
        i = parse_definition_list(tokens, i, node)
        # i = alinea_lexer.skip_to_end_of_line(tokens, i)
    # devient
    elif tokens[i] == u'devient':
        node['editType'] = 'rename'
        i += 2
        i = parse_definition(tokens, i, node)
    else:
        # Unrecognized sentence: rewind and parse it as raw content instead.
        i = r
        debug(parent, tokens, i, 'parse_edit remove')
        remove_node(parent, node)
        i = parse_raw_article_content(tokens, i, parent)
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
        return i

    # We've parsed pretty much everything we could handle. At this point,
    # there should be no meaningful content. But there might be trailing
    # spaces or punctuation (often "." or ";"), so we skip to the end of
    # the line.
    i = alinea_lexer.skip_to_end_of_line(tokens, i)

    debug(parent, tokens, i, 'parse_edit end')

    return i