def get_news(link: str) -> None:
    """Download the page at *link* and hand its HTML to ``s3_handler``.

    The stored name is the last path segment of *link* (the segment before
    a trailing slash, if there is one) with ``.html`` appended; it is passed
    to ``s3_handler.upload_file`` together with ``'teste'``.

    Nothing is uploaded when the server answers with a status other than
    200, so error pages are never stored as if they were news content.

    Args:
        link: Absolute URL of the news page.
    """
    print(link)
    response = requests.get(link)
    if response.status_code != 200:
        # An error page is not news content: report it and skip the upload.
        print(f'skipping {link}: HTTP {response.status_code}')
        return
    segments = link.split('/')
    filename = segments[-2] if link.endswith('/') else segments[-1]
    s3_handler.upload_file(response.text, 'teste', filename + '.html')
def normalize_news(doc):
    """Add a normalized copy of a stored news body and upload the result.

    The JSON document is read through ``s3_handler.get_file`` using
    ``doc['Key']``, its ``news.body`` text is passed through ``normalize``
    and saved next to it as ``news.body_normalized``, and the whole object
    is then uploaded to ``'00/camara-news/json'`` under the name returned by
    ``get_filename`` for the same key.
    """
    key = doc['Key']
    print(key)
    raw = s3_handler.get_file(key)
    payload = json.loads(raw)
    news = payload['news']
    news['body_normalized'] = normalize(news['body'])
    target_name = get_filename(key)
    s3_handler.upload_file(payload, '00/camara-news/json', target_name)
def getNews(url):
    """Fetch *url* and, on an HTTP 200 answer, upload its HTML to '06/html'.

    The uploaded name is the last path segment of *url* (the segment before
    a trailing slash, if there is one) followed by ``.html``.  Any other
    status code is ignored and nothing is uploaded.
    """
    page = requests.get(url)
    # Only successful answers are stored.
    if page.status_code != 200:
        return
    html = page.text
    segments = url.split('/')
    slug = segments[-2] if url.endswith('/') else segments[-1]
    s3_handler.upload_file(html, '06/html', '%s.html' % slug)
def get_detalhe_deputado(deputado_id):
    """Fetch one deputy's detail record and upload its ``dados`` payload.

    Requests ``{url_deputados}/{deputado_id}``; when the answer is HTTP 200
    the ``dados`` object of the JSON body is uploaded to
    ``'00/dadosabertos-camara-deputados'`` as ``<dados.id>.json``.  Other
    status codes are ignored.
    """
    answer = requests.get(f'{url_deputados}/{deputado_id}')
    if answer.status_code != 200:
        return
    details = json.loads(answer.text)['dados']
    target_name = str(details['id']) + '.json'
    s3_handler.upload_file(
        details, '00/dadosabertos-camara-deputados', target_name)
def normalize_deputados(doc):
    """Add normalized name fields to a stored deputy record and re-upload it.

    The JSON document is read through ``s3_handler.get_file`` using
    ``doc['Key']``.  ``nomeCivil``, ``ultimoStatus.nome`` and
    ``ultimoStatus.siglaPartido`` are each passed through ``normalize`` and
    stored beside the original under the same key with a ``_normalized``
    suffix.  The object is then uploaded to
    ``'00/dadosabertos-camara-deputados'`` under the name returned by
    ``get_filename`` for the same key.
    """
    key = doc['Key']
    print(key)
    record = json.loads(s3_handler.get_file(key))

    record['nomeCivil_normalized'] = normalize(record['nomeCivil'])

    status = record['ultimoStatus']
    status['nome_normalized'] = normalize(status['nome'])
    status['siglaPartido_normalized'] = normalize(status['siglaPartido'])

    target_name = get_filename(key)
    s3_handler.upload_file(
        record, '00/dadosabertos-camara-deputados', target_name)
# Fragment: `soup` and `title` are bound before this point (not visible here).
# Extracts from `soup`, in order:
#   - date:       stripped text of <p class="g-artigo__data-hora">
#   - article:    <div class="js-article-read-more">; its text becomes news['body']
#   - references: href + text of every <a> found inside the article element
#   - news_link:  `content` of <meta property="og:url">
#   - category:   text of <span class="g-artigo__categoria">
#   - proposals:  anchor text + href of each <li class="integra-lista__item">
# The values are bundled with `title` into `data` and uploaded to
# '00/camara-news/json' as '<last path segment of news_link>.json' (the
# segment before a trailing slash, if there is one).
# NOTE(review): each soup.find(...) result is dereferenced directly; if soup is
# a BeautifulSoup object, a page missing one of these elements presumably
# raises here - confirm that is intended.
# The trailing load_files() call runs last; its definition is not shown here.
date = soup.find('p', attrs={'class': 'g-artigo__data-hora'}).text.strip() article = soup.find('div', attrs={'class': 'js-article-read-more'}) references = [{'link': reference['href'], 'label':reference.text} for reference in article.find_all('a')] news_link = soup.find('meta', attrs={'property': 'og:url'})['content'] news = {'body': article.text, 'link': news_link} category = soup.find('span', {'class': 'g-artigo__categoria'}).text proposals = [{'text': li.a.text, 'link': li.a['href']} for li in soup.find_all('li', {'class': 'integra-lista__item'})] data = { 'title': title, 'date': date, 'references': references, 'news': news, 'category': category, 'proposals': proposals } filename = news_link.split( '/')[-2] if news_link.endswith('/') else news_link.split('/')[-1] s3_handler.upload_file(data, '00/camara-news/json', filename+'.json') load_files()
def getDetailsPerson(id):
    """Fetch the detail record for one person from the API.

    Args:
        id: Identifier appended to ``urlApi`` to build the request URL.

    Returns:
        The decoded JSON body on HTTP 200, otherwise an empty dict.
    """
    url = '%s/%s' % (urlApi, id)
    response = requests.get(url)
    # success ?
    if response.status_code == 200:
        return json.loads(response.text)
    return {}


def getAllPersons():
    """Consume API - DadosAbertos (Deputados).

    Returns:
        The ``id`` of every entry in the ``dados`` list of the response on
        HTTP 200.  On any other status an empty list is returned, so callers
        can always iterate over the result.
    """
    response = requests.get(urlApi)
    # success ?
    if response.status_code != 200:
        return []
    return [entry['id'] for entry in json.loads(response.text)['dados']]


# Get ids from API
ids = getAllPersons()

# Iterate into ids
for person_id in ids:
    # NOTE(review): getDetailsPerson returns {} when the request fails, and
    # that empty object is still uploaded - confirm this is intended.
    data = getDetailsPerson(person_id)
    filename = '%s.json' % person_id
    s3_handler.upload_file(data, '06/api', filename)