def main(branch='master', push=False, emails_to_notify=None):
    """Run the dataset update pipeline and commit the results to git.

    Downloads any missing reports, builds the per-date and full datasets,
    commits the new files to the data repository and (optionally) pushes.
    On any failure the configured notifier is told and the exception is
    re-raised.

    Args:
        branch: branch the data repository is expected to be on.
        push: when True, push the new commit to ``origin``.
        emails_to_notify: addresses to notify; ``None`` (default) means none.
            (Was a mutable ``[]`` default — fixed to avoid a shared list
            across calls.)

    Raises:
        RuntimeError: if the repository is on an unexpected branch.
        Exception: any error raised by the pipeline is notified, logged and
            re-raised.
    """
    if emails_to_notify is None:
        emails_to_notify = []
    creds = json.loads(CREDENTIALS_PATH.read_text())
    email_sender = EmailSender(creds['EMAIL_ADDRESS'], creds['EMAIL_PASSWORD'])
    notifier = Notifier(emails=emails_to_notify, email_sender=email_sender)
    try:
        logging.info('Opening and checking the repository...')
        repo = git.Repo(DATA_REPO)
        if repo.active_branch.name != branch:
            # RuntimeError (subclass of Exception) is still caught by the
            # notifier handler below and by any caller catching Exception.
            raise RuntimeError(
                'dataset repository in unexpected branch. Expected: %s. Actual: %s'
                % (branch, repo.active_branch.name))

        logging.info('Checking for new reports...')
        new_report_paths = download_missing_reports()
        if not new_report_paths:
            logging.info('No new reports.')
        else:
            logging.info('New reports found: %s',
                         ', '.join(map(str, new_report_paths)))

        new_dataset_paths = make_single_date_datasets(skip_existing=True)
        if not new_dataset_paths:
            logging.info('No new datasets written. Exit.')
            # SystemExit is not an Exception subclass, so it skips the
            # notifier handler below; the finally clause still runs.
            sys.exit(0)
        logging.info('New datasets: %s',
                     ', '.join(map(str, new_dataset_paths)))

        full_dataset_path = make_full_dataset()

        # git add
        files_to_add = [
            str(path) for path in new_dataset_paths + [full_dataset_path]
        ]
        logging.info('Command: git add %s', ' '.join(files_to_add))
        repo.index.add(items=files_to_add)

        # git commit, dated with the most recent dataset in this batch
        latest_date = max(
            get_date_from_filename(path.name) for path in new_dataset_paths)
        commit_msg = COMMIT_TEMPLATE.format(date=latest_date)
        logging.info('Command: git commit -m %r', commit_msg)
        repo.index.commit(commit_msg)

        # git push
        if push:
            # TODO: handle push errors
            refspec = '{0}:{0}'.format(branch)
            logging.info('Command: git push %s', refspec)
            repo.remote('origin').push(refspec=refspec)
            notifier.notify('Pushed new ICCAS datasets', commit_msg)
        else:
            notifier.notify('New commit waiting for push', commit_msg)
    except Exception as exc:
        notifier.notify('Fatal error', repr(exc))
        logging.exception('Exception was raised')
        raise
    finally:
        # Always close the SMTP session, even on sys.exit(0) or errors.
        email_sender.quit()
# NOTE(review): the lines below are the tail of a function whose `def` line is
# outside this chunk; `ordered_parts`, `input_dir` and `out_path` are
# presumably bound earlier in that function — confirm against the full file.
    if not ordered_parts:
        # Nothing to concatenate: report it and return a falsy sentinel.
        print("No datasets found in", input_dir)
        return False
    dataset = pd.concat(ordered_parts, axis=0)
    # NOTE(review): `line_terminator` was renamed `lineterminator` in
    # pandas 1.5 and removed in 2.0 — confirm the pinned pandas version
    # still accepts this keyword.
    dataset.to_csv(out_path, line_terminator="\n")
    print("Full dataset written to", out_path)
    return out_path


def get_latest_data_date(dirpath=REPORTS_DATA_DIR, default: str = "") -> str:
    """Return the date of the newest dataset found in *dirpath*.

    Returns *default* when the directory does not exist or holds no
    datasets.
    """
    if not dirpath.exists():
        return default
    dataset_list = list_datasets_by_date(dirpath)
    # Assumes list_datasets_by_date returns (date, path) pairs in ascending
    # date order, so the last entry is the latest — TODO confirm.
    return dataset_list[-1][0] if dataset_list else default


if __name__ == "__main__":
    import argparse
    from download_reports import download_missing_reports

    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--overwrite", action="store_true")
    args = parser.parse_args()
    # Only fetch reports newer than the latest date already on disk.
    download_missing_reports(after=get_latest_data_date())
    new_data_paths = extract_data_from_reports(
        skip_existing=not args.overwrite)
    # Rebuild the combined dataset only when forced or when new data arrived.
    if args.overwrite or new_data_paths:
        make_dataset()
def make_full_dataset(input_dir=DATA_BY_DATE_DIR, output_dir=FULL_DATASET_DIR):
    """Concatenate all single-date datasets in *input_dir* into one CSV.

    Each per-date CSV (indexed by ``age_group``) becomes one level of a
    ``(date, age_group)`` MultiIndex in the combined frame, which is written
    to the path returned by ``get_full_dataset_path``.

    Args:
        input_dir: directory containing the single-date dataset CSVs.
        output_dir: directory where the full dataset is written
            (created if missing).

    Returns:
        The path of the written full dataset, or ``False`` when *input_dir*
        contains no datasets (falsy on purpose, for callers that truth-test
        the result).
    """
    date_path_pairs = list_datasets_by_date(input_dir)
    if not date_path_pairs:
        print('No datasets found in', input_dir)
        return False

    out_path = get_full_dataset_path(dirpath=output_dir)
    # One frame per date, keyed by date so concat can build the MultiIndex.
    # (dict comprehension replaces the original manual accumulation loop;
    # dicts preserve insertion order, so values()/keys() stay aligned.)
    iccas_by_date = {
        date: pd.read_csv(path, index_col='age_group')
        for date, path in date_path_pairs
    }
    full = pd.concat(
        iccas_by_date.values(),
        axis=0,
        keys=iccas_by_date.keys(),
        names=['date', 'age_group'],
    )
    output_dir.mkdir(parents=True, exist_ok=True)
    full.to_csv(out_path)
    print('Full dataset written to', out_path)
    return out_path


if __name__ == '__main__':
    from download_reports import download_missing_reports

    download_missing_reports()
    make_single_date_datasets()
    make_full_dataset()