def do_import_languages(file_data: List[dict]):
    imported = set()
    print("Importing languages ... ", flush=True)
    with progressbar.ProgressBar(max_value=len(file_data)) as bar:
        for idx, p in enumerate(file_data):
            info = p.get('info', {})
            classifiers = info.get('classifiers', [])
            for c in classifiers:
                if 'Programming Language' not in c:
                    continue

                # Turn the trove classifier (e.g. "Programming Language :: Python :: 3.7")
                # into a short language id built from its last two segments.
                original = c
                c = c.replace('Implementation ::', '').replace('::', ':')
                text = c
                parts = c.split(':')
                if len(parts) > 1:
                    # Collapse extra whitespace introduced by the split/join.
                    text = ' '.join(parts[-2:]).strip().replace('  ', ' ')

                if text not in imported:
                    imported.add(text)
                    session: Session = DbSession.factory()
                    lang = ProgrammingLanguage()
                    lang.description = original
                    lang.id = text
                    session.add(lang)
                    session.commit()

            bar.update(idx)

    sys.stderr.flush()
    sys.stdout.flush()
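# A minimal, standalone sketch (not part of the import pipeline) showing the idea
# behind the classifier normalization in do_import_languages() above. The sample
# classifier and the function name are illustrative, and the whitespace handling
# here is simplified relative to the chained replace used in the loader.
def _classifier_to_language_demo() -> None:
    c = 'Programming Language :: Python :: 3.7'
    c = c.replace('Implementation ::', '').replace('::', ':')
    parts = c.split(':')
    # Keep the last two segments and collapse all whitespace runs.
    text = ' '.join(' '.join(parts[-2:]).split()) if len(parts) > 1 else c.strip()
    print(text)  # Python 3.7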
def do_summary():
    session = DbSession.factory()
    print("Final numbers:")
    print("Users: {:,}".format(session.query(User).count()))
    print("Packages: {:,}".format(session.query(Package).count()))
    print("Releases: {:,}".format(session.query(Release).count()))
    print("Maintainers: {:,}".format(session.query(Maintainer).count()))
    print("Languages: {:,}".format(session.query(ProgrammingLanguage).count()))
    print("Licenses: {:,}".format(session.query(License).count()))
def load_package(data: dict, user_lookup: Dict[str, User]):
    try:
        info = data.get('info', {})

        p = Package()
        p.id = data.get('package_name', '').strip()
        if not p.id:
            return

        p.author = info.get('author')
        p.author_email = info.get('author_email')

        releases = build_releases(p.id, data.get("releases", {}))
        if releases:
            p.created_date = releases[0].created_date

        # Map maintainer email/name pairs back to the users imported earlier.
        maintainers_lookup = get_email_and_name_from_text(
            info.get('maintainer'), info.get('maintainer_email'))
        maintainers = []
        for email, name in maintainers_lookup.items():
            user = user_lookup.get(email)
            if not user:
                continue
            m = Maintainer()
            m.package_id = p.id
            m.user_id = user.id
            maintainers.append(m)

        p.summary = info.get('summary')
        p.description = info.get('description')
        p.home_page = info.get('home_page')
        p.docs_url = info.get('docs_url')
        p.package_url = info.get('package_url')
        p.author = info.get('author')
        p.author_email = info.get('author_email')
        p.license = detect_license(info.get('license'))

        session = DbSession.factory()
        session.add(p)
        session.add_all(releases)
        if maintainers:
            session.add_all(maintainers)
        session.commit()
        session.close()
    except OverflowError:
        # What the heck, people just putting fake data in here.
        # Size is terabytes...
        pass
    except Exception:
        raise
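# For reference: a minimal, hypothetical record of the shape load_package() reads.
# Only the keys accessed in this module are shown; the real PyPI JSON dumps carry
# many more fields, and these sample values are illustrative, not real data.
EXAMPLE_PACKAGE_RECORD = {
    'package_name': 'example-package',
    'releases': {},
    'info': {
        'author': 'Jane Doe',
        'author_email': 'jane@example.com',
        'maintainer': None,
        'maintainer_email': None,
        'summary': 'A one-line summary.',
        'description': 'Longer description text.',
        'home_page': 'https://example.com',
        'docs_url': None,
        'package_url': 'https://example.com/project/example-package/',
        'license': 'MIT',
        'classifiers': ['Programming Language :: Python :: 3.7'],
    },
}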
def do_user_import(user_lookup: Dict[str, str]) -> Dict[str, User]:
    print("Importing users ... ", flush=True)
    with progressbar.ProgressBar(max_value=len(user_lookup)) as bar:
        for idx, (email, name) in enumerate(user_lookup.items()):
            session: Session = DbSession.factory()
            # Don't expire instance attributes when the session commits.
            session.expire_on_commit = False

            user = User()
            user.email = email
            user.name = name
            session.add(user)
            session.commit()

            bar.update(idx)

    print()
    sys.stderr.flush()
    sys.stdout.flush()

    # Re-query so callers get the persisted User rows keyed by email.
    session: Session = DbSession.factory()
    return {u.email: u for u in session.query(User)}
def main():
    init_db()

    session = DbSession.factory()
    user_count = session.query(User).count()
    session.close()

    # Only run the (slow) import if the database hasn't been populated yet.
    if user_count == 0:
        file_data = do_load_files()
        users = find_users(file_data)

        db_users = do_user_import(users)
        do_import_packages(file_data, db_users)
        do_import_languages(file_data)
        do_import_licenses(file_data)

    do_summary()
def do_import_licenses(file_data: List[dict]):
    imported = set()
    print("Importing licenses ... ", flush=True)
    with progressbar.ProgressBar(max_value=len(file_data)) as bar:
        for idx, p in enumerate(file_data):
            info = p.get('info', {})
            license_text = detect_license(info.get('license'))
            if license_text and license_text not in imported:
                imported.add(license_text)

                session: Session = DbSession.factory()
                package_license = License()
                package_license.id = license_text
                package_license.description = info.get('license')
                session.add(package_license)
                session.commit()

            bar.update(idx)

    sys.stderr.flush()
    sys.stdout.flush()
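# Assumed script entry point (a minimal sketch; the original loader presumably
# invokes main() in a similar way when the module is run directly).
if __name__ == '__main__':
    main()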