Example #1
    def parse_details(self, response):
        full_text = self.extract_with_css(
            response, 'div.avisoContenido div#detalleAviso').extract_first()
        full_text_soup = BeautifulSoup(full_text, 'html.parser')

        title = self.extract_with_css(
            response,
            'div.avisoContenido div#detalleAviso div#tituloDetalleAviso'
        ).extract_first()
        # extract_first() returns None when the selector does not match, so
        # guard before handing the title to BeautifulSoup or the type lookup.
        title_soup = BeautifulSoup(title or '', 'html.parser')

        simple_type = Norm.get_type_from_text(title) if title else None

        yield Norm({
            'published_at': response.meta['date'],
            'title': title_soup.get_text(),
            'text': full_text_soup.get_text(),
            'type': dict(simple=simple_type),
            'link': response.meta['link'],
            'html': response.text
        })
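Several of these parse methods (e.g. example #1 above) call a self.extract_with_css(response, query) helper whose definition is not included here. Judging by the inline helpers in examples #4 to #7, it is most likely a thin wrapper over Scrapy's CSS selectors; a minimal sketch under that assumption (the class name is hypothetical):

import scrapy


class NormSpiderBase(scrapy.Spider):  # hypothetical name, for illustration only
    name = 'norm_spider_base'

    def extract_with_css(self, response, query):
        # Return the SelectorList so callers can chain .extract_first(),
        # .extract() or .re(), as the parse methods above do.
        return response.css(query)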
Example #2
    def parse_saij(self, response):
        full_text = self.extract_with_css(
            response, 'div.resultado-busqueda div#div-texto').extract_first()
        full_text = BeautifulSoup(full_text, 'html.parser').get_text()

        abstract = self.extract_with_css(
            response,
            'div.resultado-busqueda div#div-texto div#texto-norma-container'
        ).extract_first()
        abstract = BeautifulSoup(abstract, 'html.parser').get_text()
        yield Norm({
            'title': self.extract_with_css(
                response,
                'div.resultado-busqueda li.result-item dd.tit-resultado h1.p-titulo::text'
            ).extract_first(),
            'text': full_text,
            'abstract': abstract,
            'link': response.meta['link'],
            'html': response.text
        })
Example #3
    def parse_infoleg(self, response):
        full_text = response.xpath('//text()').extract()
        full_text = ' '.join(full_text)

        yield Norm({
            'text': full_text,
            'link': response.meta['link'],
            'html': response.text
        })
Example #4
    def parse_details(self, response):
        def extract_with_css(query):
            return response.css(query)

        content = extract_with_css('p.western').extract()
        content = ''.join(content)

        soup = BeautifulSoup(content, 'html.parser')

        yield Norm({
            'published_at': date.today().strftime('%Y-%m-%d'),
            'text': soup.get_text(),
            'type': dict(simple=response.meta['type']),
            'link': response.meta['link'],
            'html': response.text
        })
Example #5
    def parse_norms(self, response):
        def extract_with_css(query):
            return response.css(query)

        urls = extract_with_css('tr.texto_resumen_BO a::attr(href)').extract(
        )  # TODO: check expressions

        for url in urls:
            yield SplashRequest(url=url,
                                callback=self.parse_details,
                                meta={
                                    'type': Norm.get_type_from_text(url),
                                    'date': response.meta['date']
                                },
                                endpoint='execute',
                                args={
                                    'lua_source': self.lua_script,
                                })
Example #6
    def parse(self, response):
        def extract_with_css(query):
            return response.css(query)

        urls = extract_with_css('a::attr(href)').re(
            r'ver.*')  # TODO: check expression
        urls = list(map(response.urljoin, urls))

        for url in urls:
            yield SplashRequest(url=url,
                                callback=self.parse_details,
                                endpoint='execute',
                                args={
                                    'lua_source': self.lua_script,
                                },
                                meta={
                                    'link': url,
                                    'type': Norm.get_type_from_text(url),
                                })
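The crawl callbacks in examples #5 and #6 schedule follow-up requests through Splash's execute endpoint, so each spider must carry a self.lua_script attribute that is not shown here. A minimal sketch of what such a script could look like, assuming it only needs to render the page and hand the HTML back (the real script may wait longer or interact with the page):

import scrapy


class SomeNormSpider(scrapy.Spider):  # hypothetical container for the attribute
    name = 'some_norm_spider'

    # Executed by Splash for each SplashRequest above; the returned HTML
    # becomes the response body seen by parse_details.
    lua_script = """
    function main(splash, args)
        assert(splash:go(args.url))
        assert(splash:wait(1.0))
        return splash:html()
    end
    """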
Example #7
    def parse_details(self, response):
        def extract_with_css(query):
            return response.css(query)

        html = extract_with_css('body').extract_first()

        full_text = BeautifulSoup(html, 'html.parser').get_text()

        lines = full_text.splitlines()  # List of HTML text lines

        norms = utils.split_list_by_sep(lines, '__')

        # Join each group of lines back into one string: a list of separated
        # norms from the same source.
        norms = [' '.join(norm_lines) for norm_lines in norms]

        for norm_text in norms:
            yield Norm({
                'published_at': response.meta['date'],
                'text': norm_text,
                'type': dict(simple=response.meta['type'])
            })
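utils.split_list_by_sep in example #7 is a project-local helper that is not shown. A plausible standalone sketch, assuming it starts a new group at every line that contains the separator and drops the delimiter lines themselves (the real matching rule may differ):

def split_list_by_sep(lines, sep):
    # Group consecutive lines; a line containing `sep` (e.g. a run of
    # underscores between norms) closes the current group.
    groups, current = [], []
    for line in lines:
        if sep in line:
            if current:
                groups.append(current)
            current = []
        else:
            current.append(line)
    if current:
        groups.append(current)
    return groups


# Example: two norms separated by an underscore line.
print(split_list_by_sep(['Norm 1', 'text', '____', 'Norm 2'], '__'))
# [['Norm 1', 'text'], ['Norm 2']]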
Example #8
    def parse_norm(self, response):
        # print('Entered parse_norm')
        published_at = self.extract_with_css(
            response, 'span.meta-date::text').extract_first()
        type = self.extract_with_css(
            response, 'div.main-content h1.entry-title::text').extract_first()
        pdf_link = self.extract_with_css(response,
                                         'p.embed_download a::attr(href)')
        if len(pdf_link) == 1:
            # extract text from PDF
            # print('\nExtract text from PDF...')
            # Name the local PDF after the second-to-last path segment of the link.
            res_name = ('../ext_data/normatives/municipal/san-lorenzo/datasets/pdf/'
                        + response.meta['link'].rsplit('/', 2)[-2] + '.pdf')
            # print('res_name', res_name)
            pdf_name = pdf_link.extract_first()
            pdf_name = iri_to_uri(pdf_name)
            # print('pdf_name', pdf_name)
            urllib.request.urlretrieve(pdf_name, res_name)
            text = textract.process(res_name).decode("utf-8")
            # print('Done!\n')

        else:
            # extract plain-text
            # print('\nExtract text from HTML...')
            html = self.extract_with_css(response,
                                         'div.main-content').extract_first()
            soup = BeautifulSoup(html, 'html.parser')
            text = soup.get_text()
            # print('Done!\n')
        yield Norm({
            'published_at': published_at,
            'type': dict(full=type),
            'text': text,
            'link': response.meta['link'],
            'html': response.text
        })
        print('Finished parse_norm')
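Examples #8 and #9 rely on several imports that are not reproduced here. A plausible module header, under the assumption that parser is dateutil.parser and that iri_to_uri comes from Django's django.utils.encoding (either could instead be a project-local helper); Norm and extract_with_css are project-local and not shown:

import os
import urllib.request
from datetime import date

import textract                                 # extracts plain text from the downloaded PDFs
from bs4 import BeautifulSoup
from dateutil import parser                     # assumption: provides parser.parse(...)
from django.utils.encoding import iri_to_uri    # assumption: encodes the PDF URL before download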
Example #9
    def parse_norm(self, response):
        meta_date = self.extract_with_css(
            response, 'span.meta-date::text').extract_first()
        today = date.today().strftime('%Y-%m-%d')

        # print(meta_date)
        def date_from_es_to_en(spanish_date):
            # The site publishes dates with Spanish month names; translate the
            # month token so dateutil can parse the string.
            months = {
                'enero': 'jan', 'febrero': 'feb', 'marzo': 'mar',
                'abril': 'apr', 'mayo': 'may', 'junio': 'jun',
                'julio': 'jul', 'agosto': 'aug', 'septiembre': 'sep',
                'setiembre': 'sep', 'octubre': 'oct', 'noviembre': 'nov',
                'diciembre': 'dec'
            }
            parts = spanish_date.split()
            parts[0] = months.get(parts[0].lower(), 'None')
            return ' '.join(parts)

        meta_date = parser.parse(date_from_es_to_en(meta_date))
        meta_date = meta_date.strftime('%Y-%m-%d')
        # print(meta_date)

        if meta_date == today:
            # crawl new norm
            # print('Entered parse_norm')
            type = self.extract_with_css(
                response,
                'div.main-content h1.entry-title::text').extract_first()
            pdf_link = self.extract_with_css(response,
                                             'p.embed_download a::attr(href)')

            if len(pdf_link) == 1:
                # extract text from PDF
                # print('\nExtract text from PDF...')
                res_name = (os.getenv('NORMATIVES_MUNICIPAL_PATH')
                            + '/datasets/pdf/'
                            + response.meta['link'].rsplit('/', 2)[-2] + '.pdf')
                # print('res_name', res_name)
                pdf_name = pdf_link.extract_first()
                pdf_name = iri_to_uri(pdf_name)
                # print('pdf_name', pdf_name)
                urllib.request.urlretrieve(pdf_name, res_name)
                text = textract.process(res_name).decode("utf-8")
                # print('Done!\n')

            else:
                # extract plain-text
                # print('\nExtract text from HTML...')
                html = self.extract_with_css(
                    response, 'div.main-content').extract_first()
                soup = BeautifulSoup(html, 'html.parser')
                text = soup.get_text()
                # print('Done!\n')

            yield Norm({
                'published_at': meta_date,
                'type': dict(full=type),
                'text': text,
                'link': response.meta['link'],
                'html': response.text
            })