Example #1
0
def parse_legislature_list(url, html):
    """Extract Legislature entries from the page's <h2> headers.

    :param url: source URL of the page (kept for signature parity; unused)
    :param html: raw HTML content of the listing page
    :return: list of Legislature objects with number/start/end filled in
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    legislatures = []

    for header in soup.find_all('h2'):
        # Collapse runs of whitespace so the date regexes match reliably.
        header_text = re.sub(r'\s+', ' ', header.get_text())
        number = parse_roman(re.search('^[MDCLXVI]+', header_text).group(0))

        start = end = None

        # Open-ended legislature: only a start date is announced.
        open_match = re.search(
            r'à compter du (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', header_text)
        if open_match:
            start = parse_date(open_match.group(1))

        # Closed legislature: a full "du ... au ..." range supersedes it.
        range_match = re.search(
            r'du (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4}) '
            r'au (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', header_text)
        if range_match:
            start = parse_date(range_match.group(1))
            end = parse_date(range_match.group(2))

        legislatures.append(Legislature(number=number, start=start, end=end))

    return legislatures
def parse_legislature_list(url, html):
    """Build Legislature records from the page's <h3> section headers.

    :param url: source URL of the page (kept for signature parity; unused)
    :param html: raw HTML content of the listing page
    :return: list of Legislature objects
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    parsed = []

    for heading in soup.find_all('h3'):
        label = heading.get_text()
        number = parse_roman(re.search('^[MDCLXVI]+', label).group(0))

        # Default to an open-ended entry with an optional start date.
        started = re.search(
            r'A compter du (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', label)
        start = parse_date(started.group(1)) if started else None
        end = None

        # A full "du ... au ..." range overrides the single start date.
        span = re.search(r'du (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4}) '
                         r'au (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', label)
        if span:
            start = parse_date(span.group(1))
            end = parse_date(span.group(2))

        parsed.append(Legislature(number=number, start=start, end=end))

    return parsed
Example #3
0
    def parse_code(self, url, html):
        """
        Parse the code details and TOC from the given HTML content.

        :type  url: str
        :param url: source URL of the page

        :type  html: unicode
        :param html: Content of the HTML

        :return: the code
        """
        soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')

        code = Code(self.id_code,
                    date_pub=self.date_pub,
                    url_code=cleanup_url(url))

        # Title is the page <h1>; subtitle comes from the "vigor" banner.
        code.title = soup.h1.text.strip()
        code.subtitle = soup.find('div', {'class': 'vigor-title'}).text.strip()

        # The subtitle may embed the in-force (or abrogation) date.
        date_match = re.search(
            r'Version (?:en vigueur au|abrogée depuis le) '
            r'(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
            code.subtitle)
        if date_match:
            code.date_pub = parse_date(date_match.group(1))

        # Top-level TOC entries live in the "liste-sommaire" list.
        toc = soup.find('ul', id='liste-sommaire')
        code.children = [self.parse_toc_element(url, item)
                         for item in toc.find_all('li', recursive=False)]

        return code
Example #4
0
def parse_law(url, html, id_legi):
    """Parse a Légifrance law dossier page into a Law object.

    :param url: source URL of the dossier page
    :param html: raw HTML content of the page
    :param id_legi: Légifrance document identifier for the dossier
    :return: a populated Law, or None when the page has no usable title
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    law = Law(
        url_legi=cleanup_url(url),
        id_legi=id_legi
    )

    clean_title = merge_spaces(soup.h2.get_text()).strip()
    law.title = re.sub(r'^Dossiers législatifs( - )?', '', clean_title).strip()

    if not law.title:
        return None

    # Promulgated law: "LOI [kind] n° NUMBER ..." — the tail may hold the
    # publication date.
    title_remain = None
    law_num = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)(.*)', law.title,
                       re.I)
    if law_num:
        law.type = 'law'
        law.kind = law_num.group(1)
        law.number = law_num.group(2)
        title_remain = law_num.group(3)

    # Bill ("projet") or member's bill ("proposition") still in progress.
    prop = re.match(r'(proj|prop)(?:et de loi|osition de loi) (\w+)',
                    law.title, re.I)
    if prop:
        law.type = prop.group(1).lower()

        try:
            LAW_KINDS.index(prop.group(2))
            law.kind = prop.group(2)
        except ValueError:
            # not in list
            law.kind = None

    if title_remain:
        pub_date = re.match(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                            title_remain)

        if pub_date:
            law.pub_date = parse_date(pub_date.group(1))

    # Guard with has_attr: an <a> without href (e.g. a named anchor) would
    # otherwise raise KeyError inside the find() predicate.
    dos_senat = soup.find(lambda e: e.name == 'a' and e.has_attr('href') and (
            re.search(r'/dossier-legislatif/', e['href']) or
            re.search(r'/dossierleg/', e['href'])))
    if dos_senat:
        law.url_senat = dos_senat['href'].split('#')[0]
        law.id_senat = re.search(r'([^/]+)\.html$', law.url_senat).group(1)

    dos_an = soup.find(lambda e: e.name == 'a' and e.has_attr('href') and
                       re.search(r'/dossiers/', e['href']))

    if dos_an:
        law.url_an = dos_an['href'].split('#')[0]
        law.legislature = int(re.search(r'/(\d+)/dossiers/',
                                        law.url_an).group(1))
        law.id_an = re.search(r'([^/]+)\.asp$', law.url_an).group(1)

    return law
Example #5
0
def parse_law(url, html, id_legi):
    """Parse a Légifrance law dossier page into a Law object.

    :param url: source URL of the dossier page
    :param html: raw HTML content of the page
    :param id_legi: Légifrance document identifier
    :return: a populated Law, or None when the page title is empty
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    law = Law(url_legi=cleanup_url(url), id_legi=id_legi)

    raw_title = merge_spaces(soup.h2.get_text()).strip()
    law.title = re.sub(r'^Dossiers législatifs( - )?', '', raw_title).strip()

    if len(law.title) == 0:
        return None

    # Promulgated law: "LOI [kind] n° NUMBER ..." — keep the tail, it may
    # carry the publication date.
    remainder = None
    numbered = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)(.*)', law.title,
                        re.I)
    if numbered:
        law.type = 'law'
        law.kind = numbered.group(1)
        law.number = numbered.group(2)
        remainder = numbered.group(3)

    # Bill ("projet") or member's bill ("proposition") still in progress.
    bill = re.match(r'(proj|prop)(?:et de loi|osition de loi) (\w+)',
                    law.title, re.I)
    if bill:
        law.type = bill.group(1).lower()

        # Accept the kind only when it is a known one.
        try:
            LAW_KINDS.index(bill.group(2))
            law.kind = bill.group(2)
        except ValueError:
            # not in list
            law.kind = None

    if remainder:
        dated = re.match(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                         remainder)
        if dated:
            law.pub_date = parse_date(dated.group(1))

    # Cross-link to the Sénat dossier, if the page references one.
    senat_link = soup.find(
        lambda e: e.name == 'a' and
        (re.search(r'/dossier-legislatif/', e['href']) or
         re.search(r'/dossierleg/', e['href'])))
    if senat_link:
        law.url_senat = senat_link['href'].split('#')[0]
        law.id_senat = re.search(r'([^/]+)\.html$', law.url_senat).group(1)

    # Cross-link to the Assemblée nationale dossier.
    an_link = soup.find(
        lambda e: e.name == 'a' and re.search(r'/dossiers/', e['href']))
    if an_link:
        law.url_an = an_link['href'].split('#')[0]
        law.legislature = int(
            re.search(r'/(\d+)/dossiers/', law.url_an).group(1))
        law.id_an = re.search(r'([^/]+)\.asp$', law.url_an).group(1)

    return law
Example #6
0
    def parse_code(self, url, html):
        """
        Parse the code details and TOC from the given HTML content.

        :type  url: str
        :param url: source URL of the page

        :type  html: unicode
        :param html: Content of the HTML

        :return: the code
        """
        soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')

        # Main text container of the page layout.
        content = (soup
                   .find('div', id='content_false')
                   .find('div', attrs={'class': 'data'}))

        code = Code(self.id_code,
                    date_pub=self.date_pub,
                    url_code=cleanup_url(url))

        # Title/subtitle block; the subtitle may embed a consolidation date.
        title_div = content.find('div', id='titreTexte')
        subtitle_span = title_div.find('span',
                                       attrs={'class': 'sousTitreTexte'})
        if subtitle_span:
            code.title = title_div.text.replace(subtitle_span.text, '')
            code.subtitle = subtitle_span.text.strip()
            found = re.search(
                r'Version consolidée au (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                code.subtitle)
            if found:
                code.date_pub = parse_date(found.group(1))

        # NOTE(review): when no subtitle span exists, this relies on Code
        # pre-populating ``title`` — confirm against the Code constructor.
        code.title = code.title.strip()

        # TOC: one entry per top-level <ul> of the main container.
        code.children = [self.parse_code_ul(url, node)
                         for node in content.find_all('ul', recursive=False)]

        return code
def parse_published_law_list(url, html):
    """Parse a year-grouped list of published laws into Law objects.

    :param url: source URL of the listing page (used to resolve links)
    :param html: raw HTML content of the page
    :return: list of Law objects, one per "LOI n° ..." link found
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    laws = []

    for year_header in soup.find_all('h3'):
        year = int(year_header.get_text())

        # Each year header is followed by a <ul> of law links.
        law_list = year_header.find_next_sibling('ul')
        if not law_list:
            continue

        for anchor in law_list.select('li a'):
            anchor_text = anchor.get_text()
            numbered = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)',
                                anchor_text, re.I)
            if not numbered:
                continue

            # Legislature and document id are carried in the query string.
            url_legi = cleanup_url(urljoin(url, anchor['href']))
            query = parse_qs(urlparse(url_legi).query)

            # The text right after the link holds the publication date.
            tail = anchor.next_sibling
            dated = re.match(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                             tail)

            laws.append(Law(
                year=year,
                legislature=int(query['legislature'][0]),
                number=numbered.group(2),
                type='law',
                kind=numbered.group(1),
                pub_date=parse_date(dated.group(1)) if dated else None,
                title=merge_spaces(anchor_text + tail),
                url_legi=url_legi,
                id_legi=query['idDocument'][0]
            ))

    return laws
Example #8
0
def parse_published_law_list(url, html, **law_args):
    """Parse a year-grouped list of published laws into Law objects.

    :param url: source URL of the listing page (used to resolve links)
    :param html: raw HTML content of the page
    :param law_args: extra keyword arguments forwarded to each Law
    :return: list of Law objects, one per "LOI n° ..." link found
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    results = []

    for year_header in soup.find_all('h2'):
        year = int(year_header.get_text().strip())
        ul = year_header.find_next('ul')

        # A year header without a following list has nothing to parse.
        # (Dropped the leftover debug print that polluted stdout here.)
        if not ul:
            continue

        for law_entry in ul.select('li a'):
            link_text = law_entry.get_text().strip()
            law_num = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)',
                               link_text, re.I)

            if not law_num:
                continue

            url_legi = cleanup_url(urljoin(url, law_entry['href']))
            # The document id is the second path segment of the link.
            id_legi = law_entry['href'].strip('/').split('/')[1]

            # The publication date follows the matched "LOI n° ..." prefix.
            pub_date = re.match(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                                link_text[len(law_num.group(0)):])

            results.append(
                Law(year=year,
                    number=law_num.group(2),
                    type='law',
                    kind=law_num.group(1),
                    pub_date=parse_date(pub_date.group(1))
                    if pub_date else None,
                    title=merge_spaces(link_text),
                    url_legi=url_legi,
                    id_legi=id_legi,
                    **law_args))

    return results
def parse_published_law_list(url, html):
    """Parse a year-grouped list of published laws into Law objects.

    :param url: source URL of the listing page (used to resolve links)
    :param html: raw HTML content of the page
    :return: list of Law objects, one per "LOI n° ..." link found
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    collected = []

    for header in soup.find_all('h3'):
        year = int(header.get_text())

        # Laws for the year are listed in the sibling <ul>, if any.
        listing = header.find_next_sibling('ul')
        if not listing:
            continue

        for link in listing.select('li a'):
            text = link.get_text()
            match = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)',
                             text, re.I)
            if not match:
                continue

            # Legislature and document id come from the link's query string.
            absolute = cleanup_url(urljoin(url, link['href']))
            params = parse_qs(urlparse(absolute).query)

            # Text following the anchor carries the publication date.
            trailing = link.next_sibling
            date_match = re.match(
                r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', trailing)
            when = parse_date(date_match.group(1)) if date_match else None

            collected.append(
                Law(year=year,
                    legislature=int(params['legislature'][0]),
                    number=match.group(2),
                    type='law',
                    kind=match.group(1),
                    pub_date=when,
                    title=merge_spaces(text + trailing),
                    url_legi=absolute,
                    id_legi=params['idDocument'][0]))

    return collected