def fetch_latest_backup(destination_path, force_all=False):
    """
    Ask the datasets downloader for the files listed in its ``LATEST``
    attribute.

    :param destination_path: (str) directory handed to ``Datasets`` and
        checked for files that already exist
    :param force_all: (bool) when true, request every file in ``LATEST``
        without checking whether it already exists in ``destination_path``
    """
    backup = Datasets(destination_path)
    latest = backup.downloader.LATEST

    if force_all:
        pending = latest
    else:
        # Only request the files that are not on disk yet.
        pending = tuple(
            name for name in latest
            if not os.path.exists(os.path.join(destination_path, name))
        )

    if pending:
        backup.downloader.download(pending)
        return

    log.info(
        'You already have all the latest datasets! Nothing to download.')
def fetch_session_start_times(data_dir, pivot, session_dates):
    """
    Fetch the session start times, save them as CSV and log a summary.

    :param data_dir: (str) directory in which the output file will be saved
    :param pivot: (int) congressperson document to use as a pivot for scraping
        the data
    :param session_dates: (list) datetime objects to fetch the start times for
    :return: the dataframe returned by ``SessionStartTimesDataset.fetch``
    """
    session_start_times = SessionStartTimesDataset()
    df = session_start_times.fetch(pivot, session_dates)
    save_to_csv(df, data_dir, "session-start-times")

    # Logger methods format ``msg % args``; a placeholder is required for
    # every extra argument, otherwise the record fails to format.
    log.info("Dates requested: %s", len(session_dates))
    found = pd.to_datetime(
        df['date'], format="%Y-%m-%d %H:%M:%S").dt.date.unique()
    log.info("Dates found: %s", len(found))
    return df
def read_csv(self, name):
    """
    Load the CSV file ``name`` located under ``self.path``.

    The columns listed below are read as strings instead of letting pandas
    infer their type (presumably so identifiers are not turned into
    numbers -- confirm against the data).

    :param name: (str) file name, relative to ``self.path``
    :return: (pandas.DataFrame) the contents of the file
    """
    filepath = os.path.join(self.path, name)
    log.info('Loading {}…'.format(name))

    string_columns = (
        'applicant_id',
        'batch_number',
        'cnpj_cpf',
        'congressperson_document',
        'congressperson_id',
        'document_id',
        'document_number',
        'document_type',
        'leg_of_the_trip',
        'passenger',
        'reimbursement_number',
        'subquota_group_description',
        'subquota_group_id',
        'subquota_number',
        'term_id',
    )
    # ``np.str`` was only an alias of the builtin ``str`` and has been
    # removed from recent NumPy releases; the builtin works everywhere.
    dtype = dict.fromkeys(string_columns, str)
    return pd.read_csv(filepath, dtype=dtype)
def _try_fetch_xml(self, attempts, url):
    """
    Open ``url``, retrying on HTTP and socket errors.

    :param attempts: (int) maximum number of times the URL is requested
    :param url: (str) address to open
    :return: the response object returned by ``urllib.request.urlopen``, or
        ``None`` when the server answers with HTTP 500 or when every attempt
        failed
    """
    while attempts > 0:
        try:
            return urllib.request.urlopen(url, data=None, timeout=10)
        except urllib.error.HTTPError as err:
            # Logger methods format ``msg % args``, so every extra argument
            # needs its own placeholder.
            log.error("HTTP Error %s when loading URL %s", err.code, url)
            # 500 seems to be the error code for "no data found for the
            # params provided"
            if err.code == 500:
                log.info("Skipping [HTTP Status 500] {}".format(url))
                return None
            pause = self.sleep_interval / 2
        except socket.error as socketerror:
            log.error("Socket error: %s", socketerror)
            # Socket errors wait much longer than HTTP errors before retrying.
            pause = self.sleep_interval * 10

        # Shared bookkeeping for both recoverable failures.
        time.sleep(pause)
        attempts -= 1
        if attempts > 0:
            log.info("Trying again %s", attempts)
        else:
            log.error("FAIL {}".format(url))

    return None
def write_reimbursement_file(self, receipts):
    """
    Build a DataFrame from ``receipts`` and write it as CSV.

    The file is written to ``self.FILE_BASE_NAME`` inside ``self.path``,
    using ``self.CSV_PARAMS`` as keyword arguments for ``to_csv``.

    :param receipts: data accepted by the ``pandas.DataFrame`` constructor
    """
    log.info('Casting changes to a new DataFrame…')
    frame = pd.DataFrame(data=receipts)

    log.info('Writing it to file…')
    target = os.path.join(self.path, self.FILE_BASE_NAME)
    frame.to_csv(target, **self.CSV_PARAMS)

    log.info('Done.')
def __init__(self):
    """
    Read the ``Amazon`` section of the configuration file into
    ``self.credentials``.

    ``self.credentials`` stays ``None`` when the configuration file is not
    found or when it has no ``Amazon`` section; in both cases a hint is
    logged instead of raising.
    """
    self.credentials = None
    self.client = None
    self.config = find_config(self.CONFIG)

    if not self.config_exists:
        hints = (
            'Could not find {} file.'.format(self.CONFIG),
            'You need Amazon section in it to interact with S3',
            '(Check config.ini.example if you need a reference.)',
        )
        for hint in hints:
            log.info(hint)
        return

    parser = configparser.RawConfigParser()
    parser.read(self.config)
    self.settings = partial(parser.get, 'Amazon')

    # Credential key -> option name inside the Amazon section.
    options = (
        ('aws_access_key_id', 'AccessKey'),
        ('aws_secret_access_key', 'SecretKey'),
        ('region_name', 'Region'),
    )
    try:
        self.credentials = {
            key: self.settings(option) for key, option in options
        }
    except configparser.NoSectionError:
        msg = ('You need an Amazon section in {} to interact with S3 '
               '(Check config.ini.example if you need a reference.)')
        log.info(msg.format(self.CONFIG))
    else:
        # friendly user message warning about old config.ini version
        region = self.credentials.get('region_name', '')
        if region and region.startswith('s3-'):
            msg = (
                'It looks like you have an old version of the config.ini '
                'file. We do not need anymore the service (s3) appended '
                'to the region (sa-east-1). Please update your config.ini '
                'replacing regions like `s3-sa-east-1` by `sa-east-1`.')
            log.info(msg)
def fetch_deputies(data_dir):
    """
    Fetch the deputies dataset, save it as CSV and log a summary.

    :param data_dir: (str) directory in which the output file will be saved
    :return: the dataframe returned by ``DeputiesDataset.fetch``
    """
    deputies = DeputiesDataset()
    df = deputies.fetch()
    save_to_csv(df, data_dir, "deputies")

    holders = df.condition == 'Holder'
    substitutes = df.condition == 'Substitute'

    # Logger methods format ``msg % args``; a placeholder is required for
    # every extra argument, otherwise the record fails to format.
    log.info("Total deputies: %s", len(df))
    log.info("Holder deputies: %s", len(df[holders]))
    log.info("Substitute deputies: %s", len(df[substitutes]))
    return df
def fetch_presences(data_dir, deputies, date_start, date_end):
    """
    Fetch the presences dataset, save it as CSV and log a summary.

    :param data_dir: (str) directory in which the output file will be saved
    :param deputies: (pandas.DataFrame) a dataframe with deputies data
    :param date_start: (str) a date in the format dd/mm/yyyy
    :param date_end: (str) a date in the format dd/mm/yyyy
    :return: the dataframe returned by ``PresencesDataset.fetch``
    """
    presences = PresencesDataset()
    df = presences.fetch(deputies, date_start, date_end)
    save_to_csv(df, data_dir, "presences")

    # Logger methods format ``msg % args``; a placeholder is required for
    # every extra argument, otherwise the record fails to format.
    log.info("Presence records: %s", len(df))
    log.info("Records of deputies present on a session: %s",
             len(df[df.presence == 'Present']))
    log.info("Records of deputies absent from a session: %s",
             len(df[df.presence == 'Absent']))
    return df
def group(self, receipts):
    """
    Collapse receipts sharing the same year, applicant_id and document_id
    into a single row.

    Rows with missing or zero values in ``document_value``,
    ``reimbursement_number`` or any of the grouping keys are discarded
    first. The result carries three aggregated columns produced through
    ``self.aggregate`` (``reimbursement_numbers``, ``total_net_value`` and
    ``reimbursement_value_total``), merged back with the receipt columns.

    :param receipts: (pandas.DataFrame) the receipts to be grouped
    :return: (pandas.DataFrame) one row per (year, applicant_id,
        document_id)
    """
    log.info(
        'Dropping rows without document_value or reimbursement_number…')
    # Lists rather than tuples: recent pandas treats a tuple given to
    # ``groupby`` as one single key instead of a sequence of keys.
    groupby_keys = ['year', 'applicant_id', 'document_id']
    required = ['document_value', 'reimbursement_number'] + groupby_keys
    receipts = receipts.dropna(subset=required)

    receipts = receipts[receipts['document_value'] != 0]
    receipts = receipts[receipts['reimbursement_number'] != '0']
    receipts = receipts[receipts['year'] != 0]
    receipts = receipts[receipts['applicant_id'] != '0']
    receipts = receipts[receipts['document_id'] != '0']

    log.info('Grouping dataset by applicant_id, document_id and year…')
    grouped = receipts.groupby(groupby_keys)

    log.info('Gathering all reimbursement numbers together…')
    numbers = self.aggregate(grouped, 'reimbursement_number',
                             'reimbursement_numbers',
                             lambda x: ', '.join(set(x)))

    log.info('Summing all net values together…')
    net_total = self.aggregate(grouped, 'net_value', 'total_net_value',
                               np.sum)

    log.info('Summing all reimbursement values together…')
    total = self.aggregate(grouped, 'reimbursement_value',
                           'reimbursement_value_total', np.sum)

    log.info('Generating the new dataset…')
    final = pd.merge(total, net_total, on=groupby_keys)
    final = pd.merge(final, numbers, on=groupby_keys)
    final = pd.merge(final, receipts, on=groupby_keys)
    # Merging with the receipts yields one row per original receipt; keep
    # a single row for each group.
    final = final.drop_duplicates(subset=groupby_keys)
    final = final.rename(columns={
        'net_value': 'net_values',
        'reimbursement_value': 'reimbursement_values'
    })
    # ``axis`` must be passed by keyword on recent pandas versions.
    final = final.drop('reimbursement_number', axis=1)
    return final
def receipts(self):
    """
    Read one ``reimbursements-<year>.xz`` file per entry of ``self.years``
    through ``self.read_csv`` and concatenate the results.

    :return: (pandas.DataFrame) the concatenation of every file read
    """
    log.info('Merging all datasets…')
    names = ["reimbursements-{}.xz".format(year) for year in self.years]
    # ``map`` stays lazy: each file is read as ``pd.concat`` consumes it.
    return pd.concat(map(self.read_csv, names))