Example #1
 def update_datasets(self):
     os.makedirs(self.path, exist_ok=True)
     chamber_of_deputies = Dataset(self.path)
     chamber_of_deputies.fetch()
     chamber_of_deputies.translate()
     chamber_of_deputies.clean()
     fetch(self.COMPANIES_DATASET, self.path)
Example #2
 def update_datasets(self):
     os.makedirs(self.path, exist_ok=True)
     ceap = CEAPDataset(self.path)
     ceap.fetch()
     ceap.convert_to_csv()
     ceap.translate()
     ceap.clean()
     fetch(self.COMPANIES_DATASET, self.path)
Example #3
 def test_fetch(self, datasets):
     fetch('file.xz', 'test')
     datasets.assert_called_once_with('test')
     datasets.return_value.downloader.download.assert_called_once_with(
         'file.xz')
Example #4
# coding: utf-8

# # Um mês depois do primeiro mutirão
#
# https://datasciencebr.com/um-m%C3%AAs-depois-do-primeiro-mutir%C3%A3o-369975af4bb5

# In[1]:

import numpy as np
import pandas as pd
from serenata_toolbox.datasets import fetch

fetch('2016-12-06-reimbursements.xz', '../data')
reimbursements = pd.read_csv('../data/2016-12-06-reimbursements.xz',
                             dtype={
                                 'document_number': np.str,
                                 'year': np.str
                             },
                             low_memory=False)

# In[2]:

import os.path
import urllib.request
import zipfile

inbox_url = 'https://github.com/datasciencebr/serenata-de-amor-inbox/archive/master.zip'
inbox_filepath = '/tmp/master.zip'
if not os.path.exists(inbox_filepath):
    urllib.request.urlretrieve(inbox_url, inbox_filepath)
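
# The original snippet is cut off before the zipfile import is used; a minimal
# sketch of extracting the downloaded archive (the target directory below is an
# assumption, not part of the original notebook):
with zipfile.ZipFile(inbox_filepath) as archive:
    archive.extractall('/tmp/serenata-de-amor-inbox')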
Example #6
                    'DF': 'distrito_federal'}

census_link = "ftp.ibge.gov.br/Censos/Censo_Demografico_2010/resultados/total_populacao_{}.zip"


# ## Gathering cities with @cuducos Brazilian Cities script
# 
# @cuducos had already written a script listing all Brazilian cities with their associated codes and states, available in [this repository](https://github.com/cuducos/brazilian-cities).
#
# We checked, and it is the most reliable way to get the list of cities.

# In[10]:

from serenata_toolbox.datasets import fetch

fetch('2017-05-22-brazilian-cities.csv', '../data')


# In[11]:

brazilian_cities = pd.read_csv('../data/2017-05-22-brazilian-cities.csv')
brazilian_cities.head()


# In[12]:

brazilian_cities.shape


# ## Normalizing its form
# 
# * Where expenses have a total net value equal to or higher than 100 BRL
# * In which congresspeople from the 2015 term have spent public money (this filter is sketched just below)
#
# The set of cities was taken from a [random sample that sounded promising](https://twitter.com/cuducos/status/840882495868530688)… but hold your horses: further analysis is disappointing… let's get started.
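#
# A minimal sketch of that filter (it assumes a reimbursements DataFrame with
# `total_net_value` and `term` columns already loaded; the helper below is an
# illustration, not part of the original notebook):

def filter_2015_term_over_100(reimbursements):
    """Keep expenses of 100 BRL or more made by congresspeople from the 2015 term."""
    return reimbursements[(reimbursements['total_net_value'] >= 100) &
                          (reimbursements['term'] == 2015)]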

# In[1]:

import numpy as np
import pandas as pd
from serenata_toolbox.datasets import fetch

DTYPE = dict(cnpj=np.str, cnpj_cpf=np.str)

# In[2]:

fetch('2017-04-21-sex-place-distances.xz', '../data')

# In[3]:

companies = pd.read_csv('../data/2016-09-03-companies.xz',
                        dtype=DTYPE,
                        low_memory=False)
companies.cnpj = companies.cnpj.str.replace(r'\D', '')
companies.shape

# In[4]:

sex_places = pd.read_csv('../data/2017-04-21-sex-place-distances.xz',
                         dtype=DTYPE)
sex_places.shape
Example #8
 def update_companies(self):
     self.log.info('Updating companies')
     os.makedirs(self.path, exist_ok=True)
     fetch(self.COMPANIES_DATASET, self.path)
# Note: remember to correct prices with an inflation index (e.g. IPCA).
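
# A minimal sketch of such a correction (the index values would come from IBGE's
# IPCA series; this helper is an illustration, not part of the original notebook):
def deflate(nominal_value, ipca_at_expense_date, ipca_at_reference_date):
    """Express a past expense in reference-date prices using an inflation index."""
    return nominal_value * ipca_at_reference_date / ipca_at_expense_date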

# In[1]:

get_ipython().magic('matplotlib inline')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

plt.rcParams['figure.figsize'] = (20, 10)

# In[2]:

from serenata_toolbox.datasets import fetch

fetch('2016-11-19-reimbursements.xz', '../data')
fetch('2016-09-03-companies.xz', '../data')
fetch('2016-11-29-yelp-companies.xz', '../data')
fetch('2016-12-02-foursquare-companies.xz', '../data')

# In[3]:

import numpy as np
import pandas as pd

dataset = pd.read_csv('../data/2016-11-19-reimbursements.xz',
                      dtype={
                          'applicant_id': np.str,
                          'cnpj_cpf': np.str,
                          'congressperson_id': np.str,
                          'subquota_number': np.str
                      },
                      low_memory=False)
Example #10
# coding: utf-8

# # Invalid CNPJ or CPF from Federal Senate CEAP
#
# `cnpj_cpf` is the column identifying the company or individual who received the payment made by the congressperson. When this value is empty, it should mean the expense was made outside Brazil, with a company (or person) that has no Brazilian ID.

# In[1]:

import numpy as np
import pandas as pd

from serenata_toolbox.datasets import fetch

fetch('2017-05-22-federal-senate-reimbursements.xz', '../data/')

# In[2]:

dataset = pd.read_csv('../data/2017-05-22-federal-senate-reimbursements.xz',
                      converters={'cnpj_cpf': np.str},
                      encoding='utf-8')

# In[3]:

dataset = dataset[dataset['cnpj_cpf'].notnull()]
dataset.head()

# In[4]:

from pycpfcnpj import cpfcnpj
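
# The original snippet is cut off after this import; a minimal sketch of flagging
# invalid identifiers with pycpfcnpj (the `valid_cnpj_cpf` column name is an
# assumption, not part of the original notebook):
dataset['valid_cnpj_cpf'] = dataset['cnpj_cpf'].apply(cpfcnpj.validate)
dataset[~dataset['valid_cnpj_cpf']].head()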

Example #11
 def update_companies(self):
     self.log.info('Updating companies')
     os.makedirs(self.path, exist_ok=True)
     fetch(self.COMPANIES_DATASET, self.path)
# coding: utf-8

# # Expenses in closed companies
# Recently we found out that many companies are already closed or out of service. We aim to find whether there are expenses made after the company's situation changed to something other than open.

# In[1]:

import pandas as pd
import numpy as np
from serenata_toolbox.datasets import fetch

fetch('2016-09-03-companies.xz', '../data')
fetch('2016-11-19-reimbursements.xz', '../data')

# In[2]:

companies = pd.read_csv('../data/2016-09-03-companies.xz', low_memory=False)
reimbursements = pd.read_csv('../data/2016-11-19-reimbursements.xz',
                             dtype={
                                 'applicant_id': np.str,
                                 'cnpj_cpf': np.str,
                                 'congressperson_id': np.str,
                                 'subquota_number': np.str
                             },
                             low_memory=False)

# ## Formatting
# Formatting the companies' situation_date and the reimbursements' issue_date columns to a proper date format (needed for a query later), and stripping dashes and dots from the companies' cnpj.

# In[3]:
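
# The original snippet is cut off here; a minimal sketch of the formatting step
# described above (column names follow the prose: situation_date, issue_date, cnpj):
companies['situation_date'] = pd.to_datetime(companies['situation_date'], errors='coerce')
reimbursements['issue_date'] = pd.to_datetime(reimbursements['issue_date'], errors='coerce')
companies['cnpj'] = companies['cnpj'].str.replace(r'\D', '')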
Example #13
 def fetch(self):
     datasets.fetch(self.COMPANIES_FILE, self.data_path)
     datasets.fetch(self.CONGRESSPEOPLE_FILE, self.data_path)
     datasets.fetch(self.SOCIAL_ACCOUNTS_FILE, self.data_path)