def parse_details(self, response):
    full_text = self.extract_with_css(
        response, 'div.avisoContenido div#detalleAviso').extract_first()
    full_text_soup = BeautifulSoup(full_text, 'html.parser')
    title = self.extract_with_css(
        response,
        'div.avisoContenido div#detalleAviso div#tituloDetalleAviso'
    ).extract_first()
    # Derive the simplified norm type from the title, when one is present
    if title:
        title_soup = BeautifulSoup(title, 'html.parser')
        title_text = title_soup.get_text()
        simple_type = Norm.get_type_from_text(title)
    else:
        title_text = None
        simple_type = None
    yield Norm({
        'published_at': response.meta['date'],
        'title': title_text,
        'text': full_text_soup.get_text(),
        'type': dict(simple=simple_type),
        'link': response.meta['link'],
        'html': response.text
    })
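# Note: the methods above call self.extract_with_css(response, query), while other
# spiders below define a local extract_with_css(query). The shared helper is not
# shown here; a minimal sketch, assuming it simply wraps Scrapy's CSS selector API
# the same way the local versions do:
def extract_with_css(self, response, query):
    # Return the SelectorList for the given CSS query; callers chain
    # .extract_first() / .extract() / .re() on the result.
    return response.css(query)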
def parse_saij(self, response):
    full_text = self.extract_with_css(
        response, 'div.resultado-busqueda div#div-texto').extract_first()
    full_text = BeautifulSoup(full_text, 'html.parser').get_text()
    abstract = self.extract_with_css(
        response,
        'div.resultado-busqueda div#div-texto div#texto-norma-container'
    ).extract_first()
    abstract = BeautifulSoup(abstract, 'html.parser').get_text()
    yield Norm({
        'title': self.extract_with_css(
            response,
            'div.resultado-busqueda li.result-item dd.tit-resultado h1.p-titulo::text'
        ).extract_first(),
        'text': full_text,
        'abstract': abstract,
        'link': response.meta['link'],
        'html': response.text
    })
def parse_infoleg(self, response):
    # Collect every text node on the page and join it into a single string
    full_text = response.xpath('//text()').extract()
    full_text = ' '.join(full_text)
    yield Norm({
        'text': full_text,
        'link': response.meta['link'],
        'html': response.text
    })
def parse_details(self, response):
    def extract_with_css(query):
        return response.css(query)

    content = extract_with_css('p.western').extract()
    content = ''.join(content)
    soup = BeautifulSoup(content, 'html.parser')
    yield Norm({
        'published_at': date.today().strftime('%Y-%m-%d'),
        'text': soup.get_text(),
        'type': dict(simple=response.meta['type']),
        'link': response.meta['link'],
        'html': response.text
    })
def parse_norms(self, response):
    def extract_with_css(query):
        return response.css(query)

    urls = extract_with_css(
        'tr.texto_resumen_BO a::attr(href)').extract()  # TODO: check expressions
    for url in urls:
        yield SplashRequest(url=url,
                            callback=self.parse_details,
                            meta={
                                'type': Norm.get_type_from_text(url),
                                'date': response.meta['date']
                            },
                            endpoint='execute',
                            args={
                                'lua_source': self.lua_script,
                            })
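# Norm.get_type_from_text is used to derive a simplified norm type from a URL or
# title string. Its real implementation lives on the Norm item and is not shown
# here; a hypothetical sketch, assuming it matches common norm keywords (the
# keyword list below is an assumption, for illustration only):
@staticmethod
def get_type_from_text(text):
    lowered = text.lower()
    for keyword in ('ley', 'decreto', 'resolucion', 'ordenanza', 'disposicion'):
        if keyword in lowered:
            return keyword
    return None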
def parse(self, response):
    def extract_with_css(query):
        return response.css(query)

    urls = extract_with_css('a::attr(href)').re(
        r'ver.*')  # TODO: check expression
    urls = list(map(response.urljoin, urls))
    for url in urls:
        yield SplashRequest(url=url,
                            callback=self.parse_details,
                            endpoint='execute',
                            args={
                                'lua_source': self.lua_script,
                            },
                            meta={
                                'link': url,
                                'type': Norm.get_type_from_text(url),
                            })
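# The SplashRequests above run a Lua script (self.lua_script) on Splash's
# 'execute' endpoint so that JavaScript-rendered pages are fully loaded before
# parsing. The actual script is defined elsewhere in the spider; a minimal sketch
# of what such a script typically looks like (an assumption, for illustration):
lua_script = """
function main(splash, args)
    splash:go(args.url)
    splash:wait(2)
    return {
        html = splash:html(),
    }
end
"""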
def parse_details(self, response):
    def extract_with_css(query):
        return response.css(query)

    html = extract_with_css('body').extract_first()
    full_text = BeautifulSoup(html, 'html.parser').get_text()
    lines = full_text.splitlines()  # List of HTML text lines
    # Split the lines into separate norms from the same source and collapse
    # each one into a single string
    norms = utils.split_list_by_sep(lines, '__')
    norms = [' '.join(chunk) for chunk in norms]
    for norm in norms:
        yield Norm({
            'published_at': response.meta['date'],
            'text': norm,
            'type': dict(simple=response.meta['type'])
        })
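# utils.split_list_by_sep is used above to cut the page's text lines into one
# chunk per norm at a separator marker. Its implementation is not shown; a
# minimal sketch, assuming it splits a flat list into sublists wherever an
# element starts with the given separator string:
def split_list_by_sep(lines, sep):
    chunks, current = [], []
    for line in lines:
        if line.startswith(sep):
            if current:
                chunks.append(current)
            current = []
        else:
            current.append(line)
    if current:
        chunks.append(current)
    return chunks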
def parse_norm(self, response):
    published_at = self.extract_with_css(
        response, 'span.meta-date::text').extract_first()
    norm_type = self.extract_with_css(
        response, 'div.main-content h1.entry-title::text').extract_first()
    pdf_link = self.extract_with_css(response,
                                     'p.embed_download a::attr(href)')
    if len(pdf_link) == 1:
        # The norm is published as a PDF: download it and extract its text
        res_name = ('../ext_data/normatives/municipal/san-lorenzo/datasets/pdf/'
                    + response.meta['link'].rsplit('/', 2)[-2] + '.pdf')
        pdf_name = pdf_link.extract_first()
        pdf_name = iri_to_uri(pdf_name)
        urllib.request.urlretrieve(pdf_name, res_name)
        text = textract.process(res_name).decode("utf-8")
    else:
        # The norm is published inline: extract plain text from the HTML
        html = self.extract_with_css(response,
                                     'div.main-content').extract_first()
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.get_text()
    yield Norm({
        'published_at': published_at,
        'type': dict(full=norm_type),
        'text': text,
        'link': response.meta['link'],
        'html': response.text
    })
def parse_norm(self, response):
    meta_date = self.extract_with_css(
        response, 'span.meta-date::text').extract_first()
    today = date.today().strftime('%Y-%m-%d')

    def date_from_es_to_en(m):
        # Translate the leading Spanish month name to an English abbreviation
        # so that dateutil can parse the date string
        months = {
            'enero': 'jan', 'febrero': 'feb', 'marzo': 'mar', 'abril': 'apr',
            'mayo': 'may', 'junio': 'jun', 'julio': 'jul', 'agosto': 'aug',
            'septiembre': 'sep', 'setiembre': 'sep', 'octubre': 'oct',
            'noviembre': 'nov', 'diciembre': 'dec'
        }
        split = m.split()
        split[0] = months.get(split[0].lower(), 'None')
        return ' '.join(split)

    meta_date = parser.parse(date_from_es_to_en(meta_date))
    meta_date = meta_date.strftime('%Y-%m-%d')
    if meta_date == today:
        # Only crawl norms published today
        norm_type = self.extract_with_css(
            response, 'div.main-content h1.entry-title::text').extract_first()
        pdf_link = self.extract_with_css(response,
                                         'p.embed_download a::attr(href)')
        if len(pdf_link) == 1:
            # The norm is published as a PDF: download it and extract its text
            res_name = (os.getenv('NORMATIVES_MUNICIPAL_PATH') +
                        '/datasets/pdf/' +
                        response.meta['link'].rsplit('/', 2)[-2] + '.pdf')
            pdf_name = pdf_link.extract_first()
            pdf_name = iri_to_uri(pdf_name)
            urllib.request.urlretrieve(pdf_name, res_name)
            text = textract.process(res_name).decode("utf-8")
        else:
            # The norm is published inline: extract plain text from the HTML
            html = self.extract_with_css(
                response, 'div.main-content').extract_first()
            soup = BeautifulSoup(html, 'html.parser')
            text = soup.get_text()
        yield Norm({
            'published_at': meta_date,
            'type': dict(full=norm_type),
            'text': text,
            'link': response.meta['link'],
            'html': response.text
        })
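# Example of the date handling above, assuming the site renders dates with the
# Spanish month name first (e.g. "enero 12 2021"); the translated string is then
# parsed by dateutil:
#
#   parser.parse(date_from_es_to_en('enero 12 2021')).strftime('%Y-%m-%d')
#   # -> '2021-01-12'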