Пример #1
0
    def parse_code_ul(self, url, ul):
        """
        Build a Section, with its nested sub-sections, from a TOC list

        Only the first direct ``<li>`` child of ``ul`` is read; each ``<ul>``
        directly inside that item is parsed recursively as a child section.

        :param url: URL of the page, used to resolve the section link
        :param ul: the ``<ul>`` element to parse

        :return: the section
        """
        first_item = ul.find_all('li', recursive=False)[0]

        # -- Title: the span whose class looks like "TM<digits>Code"
        title_node = first_item.find(
            'span',
            attrs={'class': re.compile(r'TM\d+Code')},
            recursive=False)
        section = Section(title_node.attrs['id'], title_node.text.strip())

        # -- Optional italic block, stored as the section content
        description = first_item.find(
            'div', attrs={'class': 'italic'}, recursive=False)
        if description:
            section.content = description.text.strip()

        # -- Optional link to the section articles
        link_holder = first_item.find(
            'span', attrs={'class': 'codeLienArt'}, recursive=False)
        if link_holder:
            anchor = link_holder.find('a', recursive=False)
            if not self.with_articles:
                # Keep only the link label when articles are not requested
                section.articles = anchor.text.strip()
            else:
                section.articles = self.section_service.articles(
                    self.id_code, section.id_section, self.date_pub)
            section.url_section = cleanup_url(
                urljoin(url, anchor.attrs['href']))

        # -- Sub-sections
        sub_sections = []
        for nested_ul in first_item.find_all('ul', recursive=False):
            sub_sections.append(self.parse_code_ul(url, nested_ul))
        section.children = sub_sections

        return section
def parse_pending_law_list(url, html):
    """
    Extract the pending laws from a listing page grouped by year

    Each ``<h3>`` is read as a year. The links found in the first ``<ul>``
    sibling following it each produce one ``Law``, whose legislature, type
    and document id are taken from the link query string.

    :param url: URL of the page, used to resolve relative links
    :param html: HTML content of the page

    :return: list of Law objects, in document order
    """
    document = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    laws = []

    for heading in document.find_all('h3'):
        year = int(heading.get_text())
        listing = heading.find_next_sibling('ul')

        # A heading that is not followed by a list contributes nothing
        anchors = listing.select('li a') if listing else []

        for anchor in anchors:
            label = anchor.get_text()
            # The NOR number, when present, ends the label in parentheses
            nor_match = re.search(r'\(([A-Z0-9]+)\)$', label)

            target = cleanup_url(urljoin(url, anchor['href']))
            query = parse_qs(urlparse(target).query)

            legislature = int(query['legislature'][0])
            law_type = query['typeLoi'][0]
            title = merge_spaces(label)
            nor = nor_match.group(1) if nor_match else None
            document_id = query['idDocument'][0]

            laws.append(Law(year=year,
                            legislature=legislature,
                            type=law_type,
                            title=title,
                            nor=nor,
                            url_legi=target,
                            id_legi=document_id))

    return laws
Пример #3
0
    def parse_code(self, url, html):
        """
        Build the code (title, subtitle, date and TOC) from an HTML page

        :type  url: str
        :param url: URL the page was fetched from

        :type  html: unicode
        :param html: HTML content of the page

        :return: the populated code
        """
        document = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')

        code = Code(self.id_code,
                    date_pub=self.date_pub,
                    url_code=cleanup_url(url))

        # -- Title comes from the <h1>, subtitle from the "vigor-title" div
        code.title = document.h1.text.strip()
        subtitle_node = document.find('div', {'class': 'vigor-title'})
        code.subtitle = subtitle_node.text.strip()

        # -- The subtitle may carry the version date, which then replaces
        #    the publication date received from the instance
        version_match = re.search(
            r'Version (?:en vigueur au|abrogée depuis le) '
            r'(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
            code.subtitle)
        if version_match:
            code.date_pub = parse_date(version_match.group(1))

        # -- TOC: one child per direct <li> of the summary list
        summary = document.find('ul', id='liste-sommaire')
        entries = []
        for item in summary.find_all('li', recursive=False):
            entries.append(self.parse_toc_element(url, item))
        code.children = entries

        return code
Пример #4
0
def parse_pending_law_list(url, html, **law_kwargs):
    """
    Extract the pending laws from a listing page grouped by year

    Each ``<h2>`` is read as a year. The links found in the first ``<ul>``
    following it each produce one ``Law``. The law id is the last path
    segment of the link URL.

    :param url: URL of the page, used to resolve relative links
    :param html: HTML content of the page
    :param law_kwargs: extra keyword arguments passed to every ``Law``

    :return: list of Law objects, in document order
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    results = []

    # Loop-invariant, so built once instead of once per link.
    # Group 1: "Projet" / "Proposition"; group 2: optional law kind.
    re_type_loi = re.compile(r'(Projet|Proposition)\s+de\s+loi\s+({})?'
                             .format('|'.join(LAW_KINDS)))

    for year_header in soup.find_all('h2'):
        year = int(year_header.get_text().strip())
        ul = year_header.find_next('ul')

        if not ul:
            continue

        for law_entry in ul.select('li a'):
            link_text = law_entry.get_text().strip()
            nor_num = re.search(r'\(([A-Z0-9]+)\)$', link_text)

            # A link whose text does not start with "Projet de loi" or
            # "Proposition de loi" used to raise AttributeError; such an
            # entry is now kept, with no type and no kind.
            type_loi = re_type_loi.match(link_text)
            if type_loi:
                # First four letters of the match: "proj" or "prop"
                law_type = type_loi.group(0).lower()[:4]
                # The kind is the LAW_KINDS capture (None when absent),
                # not the Projet/Proposition word held by group 1.
                law_kind = type_loi.group(2)
            else:
                law_type = None
                law_kind = None

            url_legi = cleanup_url(urljoin(url, law_entry['href']))
            id_legi = urlparse(url_legi).path.strip('/').split('/')[-1]

            results.append(Law(
                year=year,
                id_legi=id_legi,
                type=law_type,
                kind=law_kind,
                title=merge_spaces(link_text),
                nor=nor_num.group(1) if nor_num else None,
                url_legi=url_legi,
                **law_kwargs
            ))

    return results
def parse_pending_law_list(url, html):
    """
    List the pending laws found on a page organised by year

    The page is scanned for ``<h3>`` headings, each holding a year; the
    ``<ul>`` that follows a heading as a sibling provides the law links.

    :param url: URL of the page, used to resolve relative links
    :param html: HTML content of the page

    :return: list of Law objects, in document order
    """
    def build_law(year, link):
        # Turn one link of the list into a Law
        text = link.get_text()
        # NOR number: trailing "(...)" of the link text, when present
        nor = re.search(r'\(([A-Z0-9]+)\)$', text)

        legi_url = cleanup_url(urljoin(url, link['href']))
        params = parse_qs(urlparse(legi_url).query)

        return Law(year=year,
                   legislature=int(params['legislature'][0]),
                   type=params['typeLoi'][0],
                   title=merge_spaces(text),
                   nor=nor.group(1) if nor else None,
                   url_legi=legi_url,
                   id_legi=params['idDocument'][0])

    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    results = []

    for year_header in soup.find_all('h3'):
        year = int(year_header.get_text())
        ul = year_header.find_next_sibling('ul')

        # Headings without a following list are skipped
        if ul:
            results.extend(build_law(year, link)
                           for link in ul.select('li a'))

    return results
Пример #6
0
def parse_law(url, html, id_legi):
    """
    Parse the page of a law into a Law object

    The title is read from the first ``<h2>``. Depending on its wording it
    yields either a published law (type ``'law'``, with number, optional
    kind and optional publication date) or a bill (type ``'proj'`` or
    ``'prop'``, with a kind when it belongs to ``LAW_KINDS``). Links to the
    Senate and Assemblée nationale files are picked up when present.

    :param url: URL of the page, stored (cleaned up) as ``url_legi``
    :param html: HTML content of the page
    :param id_legi: identifier stored as ``id_legi``

    :return: the Law, or None when the title is empty once the
             "Dossiers législatifs" prefix has been removed
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    law = Law(
        url_legi=cleanup_url(url),
        id_legi=id_legi
    )

    clean_title = merge_spaces(soup.h2.get_text()).strip()
    law.title = re.sub(r'^Dossiers législatifs( - )?', '', clean_title).strip()

    if not law.title:
        return None

    # -- Published law: "LOI [kind] n° <number> <remainder>"
    title_remain = None
    law_num = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)(.*)', law.title,
                       re.I)
    if law_num:
        law.type = 'law'
        law.kind = law_num.group(1)
        law.number = law_num.group(2)
        title_remain = law_num.group(3)

    # -- Bill: "projet de loi <word>" / "proposition de loi <word>"
    prop = re.match(r'(proj|prop)(?:et de loi|osition de loi) (\w+)',
                    law.title, re.I)
    if prop:
        law.type = prop.group(1).lower()
        # The word following the type is a kind only if it is a known one
        kind = prop.group(2)
        law.kind = kind if kind in LAW_KINDS else None

    if title_remain:
        pub_date = re.match(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                            title_remain)

        if pub_date:
            law.pub_date = parse_date(pub_date.group(1))

    # -- Senate file.
    # ``e.get('href', '')`` instead of ``e['href']``: an <a> without href
    # (e.g. a named anchor) would otherwise raise KeyError during the search.
    dos_senat = soup.find(lambda e: e.name == 'a' and (
            re.search(r'/dossier-legislatif/', e.get('href', '')) or
            re.search(r'/dossierleg/', e.get('href', ''))))
    if dos_senat:
        law.url_senat = dos_senat['href'].split('#')[0]
        # The id is only set when the URL ends with "<id>.html"
        id_senat = re.search(r'([^/]+)\.html$', law.url_senat)
        if id_senat:
            law.id_senat = id_senat.group(1)

    # -- Assemblée nationale file
    dos_an = soup.find(lambda e: e.name == 'a' and
                       re.search(r'/dossiers/', e.get('href', '')))

    if dos_an:
        law.url_an = dos_an['href'].split('#')[0]
        # Legislature and id are only set when the URL has the expected
        # "/<digits>/dossiers/" and "<id>.asp" parts
        legislature = re.search(r'/(\d+)/dossiers/', law.url_an)
        if legislature:
            law.legislature = int(legislature.group(1))
        id_an = re.search(r'([^/]+)\.asp$', law.url_an)
        if id_an:
            law.id_an = id_an.group(1)

    return law
Пример #7
0
def parse_law(url, html, id_legi):
    """
    Parse the page of a law into a Law object

    The title comes from the first ``<h2>``, minus a leading
    "Dossiers législatifs" prefix. It is then matched against the published
    law wording ("LOI [kind] n° number ...") and the bill wording
    ("projet/proposition de loi kind") to fill type, kind, number and
    publication date. Links to the Senate and Assemblée nationale files
    are recorded when the page has them.

    :param url: URL of the page, stored (cleaned up) as ``url_legi``
    :param html: HTML content of the page
    :param id_legi: identifier stored as ``id_legi``

    :return: the Law, or None when the remaining title is empty
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    law = Law(url_legi=cleanup_url(url), id_legi=id_legi)

    def find_link(pattern):
        # First <a> whose href matches ``pattern``. The href is read with
        # ``get`` because indexing an <a> that has no href raises KeyError.
        return soup.find(lambda e: e.name == 'a' and
                         re.search(pattern, e.get('href', '')))

    clean_title = merge_spaces(soup.h2.get_text()).strip()
    law.title = re.sub(r'^Dossiers législatifs( - )?', '', clean_title).strip()

    if not law.title:
        return None

    title_remain = None
    law_num = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)(.*)', law.title,
                       re.I)
    if law_num:
        law.type = 'law'
        law.kind = law_num.group(1)
        law.number = law_num.group(2)
        # What follows the number may hold the publication date
        title_remain = law_num.group(3)

    prop = re.match(r'(proj|prop)(?:et de loi|osition de loi) (\w+)',
                    law.title, re.I)
    if prop:
        law.type = prop.group(1).lower()
        # Keep the following word as kind only when it is a known kind
        law.kind = prop.group(2) if prop.group(2) in LAW_KINDS else None

    if title_remain:
        pub_date = re.match(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                            title_remain)

        if pub_date:
            law.pub_date = parse_date(pub_date.group(1))

    # -- Senate file (either URL flavour)
    dos_senat = find_link(r'/dossier-legislatif/|/dossierleg/')
    if dos_senat:
        law.url_senat = dos_senat['href'].split('#')[0]
        # Set the id only when the URL ends with "<id>.html"
        id_senat = re.search(r'([^/]+)\.html$', law.url_senat)
        if id_senat:
            law.id_senat = id_senat.group(1)

    # -- Assemblée nationale file
    dos_an = find_link(r'/dossiers/')
    if dos_an:
        law.url_an = dos_an['href'].split('#')[0]
        # Set legislature and id only when the URL has the expected parts
        legislature = re.search(r'/(\d+)/dossiers/', law.url_an)
        if legislature:
            law.legislature = int(legislature.group(1))
        id_an = re.search(r'([^/]+)\.asp$', law.url_an)
        if id_an:
            law.id_an = id_an.group(1)

    return law
def parse_common_law_list(url, html):
    """
    Extract laws and their common names from a listing page

    The entries are the ``<li>`` elements of the first ``<ul>`` inside the
    ``content_right`` div; items without a link are ignored. For each entry
    the first text line is the title and the last one holds the common
    name(s). The law id comes from the ``cidTexte`` query parameter.

    :param url: URL of the page, used to resolve relative links
    :param html: HTML content of the page

    :return: list of Law objects, in document order
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    results = []

    div = soup.find('div', {'id': 'content_right'})
    ul = div.find('ul')

    re_find_common = re.compile(r'dite?[: ]+(?:loi )?"\s*([^"]+?)\s*"', re.I)
    re_find_second = re.compile(r'"\s*ou ((?:loi )?)"\s*([^"]+?)\s*"', re.I)
    re_find_nor = re.compile(r'NOR\s*([A-Z0-9]+)\n')

    for law_entry in ul.select('li'):
        link = law_entry.find('a')
        if not link:
            continue

        link_text = _clean_typos_legifrance(law_entry.get_text())
        nor_num = re_find_nor.search(link_text)
        url_legi = cleanup_url(urljoin(url, link['href']))
        qs_legi = parse_qs(urlparse(url_legi).query)

        text_parts = link_text.strip("\n\r\t )").split('\n')
        title = merge_spaces(text_parts[0])
        common_text = merge_spaces(text_parts[-1]).strip("() ")

        # Explicit "no match" checks replace the former blanket
        # ``except Exception`` blocks, which would also have hidden any
        # unrelated error raised on these lines.
        common_match = re_find_common.search(common_text)
        if common_match:
            common = common_match.group(1)
        else:
            # No 'dit(e) "..."' wording: keep the whole line as the name
            common = common_text

        # Optional alternative name introduced by: " ou "..."
        second = re_find_second.search(common_text)
        if second:
            common += " ; %s" % "".join(second.groups())

        results.append(
            Law(
                title=title,
                common_name=common.replace('Loi', 'loi'),
                nor=nor_num.group(1) if nor_num else None,
                url_legi=url_legi,
                id_legi=qs_legi['cidTexte'][0]
            )
        )

    return results
Пример #9
0
    def parse_code(self, url, html):
        """
        Parse the code details and TOC from the given HTML content

        The title is the text of the ``titreTexte`` div. When that div
        contains a ``sousTitreTexte`` span, the span text is removed from
        the title and stored as the subtitle, and a "Version consolidée au"
        date found in it replaces the publication date.

        :type  url: str
        :param url: source URL of the page

        :type  html: unicode
        :param html: Content of the HTML

        :return: the code
        """
        soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')

        # -- main text
        div = (soup
               .find('div', id='content_false')
               .find('div', attrs={'class': 'data'}))

        code = Code(self.id_code,
                    date_pub=self.date_pub,
                    url_code=cleanup_url(url))

        # -- Code title/subtitle
        div_title = div.find('div', id='titreTexte')
        span_subtitle = div_title.find('span',
                                       attrs={'class': 'sousTitreTexte'})

        # The title is read from the page whether or not a subtitle
        # exists; it used to be assigned only inside the subtitle branch,
        # while the final strip() was applied unconditionally.
        title = div_title.text
        if span_subtitle:
            title = title.replace(span_subtitle.text, '')
            code.subtitle = span_subtitle.text.strip()
            regex = r'Version consolidée au (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})'
            m = re.search(regex, code.subtitle)
            if m:
                code.date_pub = parse_date(m.group(1))

        code.title = title.strip()

        # -- TOC
        code.children = [self.parse_code_ul(url, child)
                         for child in div.find_all('ul', recursive=False)]

        return code
def parse_published_law_list(url, html):
    """
    Extract the published laws from a listing page grouped by year

    Each ``<h3>`` is read as a year. In the first ``<ul>`` sibling following
    it, every link whose text starts with "LOI [kind] n° number" produces
    one ``Law``; other links are ignored. The text right after the link
    completes the title and may provide the publication date.

    :param url: URL of the page, used to resolve relative links
    :param html: HTML content of the page

    :return: list of Law objects, in document order
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    results = []

    # Loop-invariant patterns, compiled once
    re_law_num = re.compile(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)', re.I)
    re_pub_date = re.compile(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})')

    for year_header in soup.find_all('h3'):
        year = int(year_header.get_text())
        ul = year_header.find_next_sibling('ul')

        if not ul:
            continue

        for law_entry in ul.select('li a'):
            link_text = law_entry.get_text()
            law_num = re_law_num.match(link_text)

            if not law_num:
                continue

            url_legi = cleanup_url(urljoin(url, law_entry['href']))
            qs_legi = parse_qs(urlparse(url_legi).query)

            # ``next_sibling`` is None when the link is the last node of
            # its parent; fall back to an empty string so the match and
            # the concatenation below do not raise TypeError.
            title = law_entry.next_sibling or ''
            pub_date = re_pub_date.match(title)

            results.append(Law(
                year=year,
                legislature=int(qs_legi['legislature'][0]),
                number=law_num.group(2),
                type='law',
                kind=law_num.group(1),
                pub_date=parse_date(pub_date.group(1)) if pub_date else None,
                title=merge_spaces(link_text + title),
                url_legi=url_legi,
                id_legi=qs_legi['idDocument'][0]
            ))

    return results
Пример #11
0
def parse_published_law_list(url, html, **law_args):
    """
    List the published laws found on a page organised by year

    Each ``<h2>`` holds a year and the first ``<ul>`` after it holds the
    links. Only links whose text starts with "LOI [kind] n° number" are
    kept; the publication date is looked for right after that prefix. The
    law id is the second path segment of the link ``href``.

    :param url: URL of the page, used to resolve relative links
    :param html: HTML content of the page
    :param law_args: extra keyword arguments passed to every ``Law``

    :return: list of Law objects, in document order
    """
    document = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    laws = []

    for heading in document.find_all('h2'):
        year = int(heading.get_text().strip())
        listing = heading.find_next('ul')

        if not listing:
            print('No ul found')
            continue

        for anchor in listing.select('li a'):
            label = anchor.get_text().strip()
            number_match = re.match(
                r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)', label, re.I)

            # Links that are not laws are ignored
            if number_match is None:
                continue

            href = anchor['href']
            target = cleanup_url(urljoin(url, href))
            legi_id = href.strip('/').split('/')[1]

            # The date, if any, immediately follows the "LOI ... n° x" part
            remainder = label[number_match.end():]
            date_match = re.match(
                r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', remainder)
            if date_match:
                published = parse_date(date_match.group(1))
            else:
                published = None

            laws.append(Law(year=year,
                            number=number_match.group(2),
                            type='law',
                            kind=number_match.group(1),
                            pub_date=published,
                            title=merge_spaces(label),
                            url_legi=target,
                            id_legi=legi_id,
                            **law_args))

    return laws
def parse_published_law_list(url, html):
    """
    List the published laws found on a page organised by year

    Years come from the ``<h3>`` headings and laws from the links of the
    ``<ul>`` that follows each heading as a sibling. A link is kept only
    when its text starts with "LOI [kind] n° number". The text node placed
    after the link is appended to the title and searched for a
    "du <day> <month> <year>" publication date.

    :param url: URL of the page, used to resolve relative links
    :param html: HTML content of the page

    :return: list of Law objects, in document order
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    results = []

    for year_header in soup.find_all('h3'):
        year = int(year_header.get_text())
        ul = year_header.find_next_sibling('ul')

        if not ul:
            continue

        for law_entry in ul.select('li a'):
            link_text = law_entry.get_text()
            law_num = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)',
                               link_text, re.I)

            if not law_num:
                continue

            url_legi = cleanup_url(urljoin(url, law_entry['href']))
            qs_legi = parse_qs(urlparse(url_legi).query)

            # A link that closes its parent has no next sibling (None),
            # which made re.match and the title concatenation raise
            # TypeError; an empty trailing text is used instead.
            title = law_entry.next_sibling
            if title is None:
                title = ''

            pub_date = re.match(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                                title)

            results.append(
                Law(year=year,
                    legislature=int(qs_legi['legislature'][0]),
                    number=law_num.group(2),
                    type='law',
                    kind=law_num.group(1),
                    pub_date=parse_date(pub_date.group(1))
                    if pub_date else None,
                    title=merge_spaces(link_text + title),
                    url_legi=url_legi,
                    id_legi=qs_legi['idDocument'][0]))

    return results