def do_import_languages(file_data: List[dict]):
    """Insert one ProgrammingLanguage row per distinct language classifier.

    For every package record, the ``info['classifiers']`` entries that
    contain 'Programming Language' are reduced to a short id built from
    their last two ':'-separated segments; each id is stored once, with
    the untouched classifier text as its description.

    Args:
        file_data: package records (dicts). Records without an 'info'
            mapping or without classifiers contribute nothing.
    """
    imported = set()
    print("Importing languages ... ", flush=True)

    # A single session for the whole import, always closed, instead of a
    # fresh session per language that was never released.
    session = db_session.create_session()
    try:
        with progressbar.ProgressBar(max_value=len(file_data)) as bar:
            for idx, p in enumerate(file_data):
                info = p.get('info') or {}
                for original in info.get('classifiers') or []:
                    if 'Programming Language' not in original:
                        continue

                    cleaned = original.replace('Implementation ::', '').replace('::', ':')
                    text = cleaned
                    parts = cleaned.split(':')
                    if len(parts) > 1:
                        # NOTE(review): replace(' ', ' ') swaps a space for a
                        # space, so it changes nothing as written; kept
                        # verbatim so the stored ids stay the same. Possibly
                        # meant to collapse doubled spaces - confirm.
                        text = ' '.join(parts[-2:]).strip().replace(' ', ' ')

                    if text in imported:
                        continue
                    imported.add(text)

                    lang = ProgrammingLanguage()
                    lang.description = original
                    lang.id = text
                    session.add(lang)
                    # Commit per language so rows already stored survive a
                    # later failure, as before.
                    session.commit()

                bar.update(idx)
    finally:
        session.close()

    sys.stderr.flush()
    sys.stdout.flush()
def _prompt_for_release():
    """Read one release (version numbers and size) from stdin and return it."""
    r = Release()
    r.major_ver = int(input("Major version: "))
    r.minor_ver = int(input("Minor version: "))
    r.build_ver = int(input("Build version: "))
    r.size = int(input("Size in bytes: "))
    return r


def insert_a_package():
    """Interactively create a package with two releases and save it.

    Prompts on stdin for the package fields, then for two releases, and
    commits the package (with its releases attached) in one transaction.

    Raises:
        ValueError: if a numeric prompt receives a non-integer answer.
    """
    p = Package()
    p.id = input('Package id / name: ').strip().lower()
    p.summary = input("Package summary: ").strip()
    p.author_name = input("Author: ").strip()
    p.license = input("License: ").strip()

    # The two release prompts were copy-pasted; they differ only in the
    # heading number.
    for number in (1, 2):
        print("Release {}:".format(number))
        p.releases.append(_prompt_for_release())

    session = db_session.create_session()
    try:
        session.add(p)
        session.commit()
    finally:
        # Release the connection even when the commit fails.
        session.close()
def do_summary():
    """Print the row count of each imported table, thousands-separated."""
    session = db_session.create_session()
    try:
        print("Final numbers:")
        totals = (
            ("Users", User),
            ("Packages", Package),
            ("Releases", Release),
            ("Maintainers", Maintainer),
            ("Languages", ProgrammingLanguage),
            ("Licenses", License),
        )
        for label, model in totals:
            print("{}: {:,}".format(label, session.query(model).count()))
    finally:
        # The session was previously never closed.
        session.close()
def load_package(data: dict, user_lookup: Dict[str, User]):
    """Insert one package together with its releases and known maintainers.

    Args:
        data: raw package record; the 'package_name', 'info' and
            'releases' keys are read.
        user_lookup: users keyed by email. Maintainer emails that are not
            in this mapping are skipped.

    Records whose package name is blank are ignored. An OverflowError
    raised while building or saving the record is deliberately swallowed
    and that package is skipped; every other exception propagates.
    """
    session = None
    try:
        info = data.get('info') or {}

        p = Package()
        p.id = data.get('package_name', '').strip()
        if not p.id:
            return

        p.author = info.get('author')
        p.author_email = info.get('author_email')

        releases = build_releases(p.id, data.get("releases", {}))
        if releases:
            p.created_date = releases[0].created_date

        maintainers_lookup = get_email_and_name_from_text(
            info.get('maintainer'), info.get('maintainer_email'))
        maintainers = []
        for email in maintainers_lookup:
            user = user_lookup.get(email)
            if not user:
                continue

            m = Maintainer()
            m.package_id = p.id
            m.user_id = user.id
            maintainers.append(m)

        p.summary = info.get('summary')
        p.description = info.get('description')
        p.home_page = info.get('home_page')
        p.docs_url = info.get('docs_url')
        p.package_url = info.get('package_url')
        p.license = detect_license(info.get('license'))

        session = db_session.create_session()
        session.add(p)
        session.add_all(releases)
        if maintainers:
            session.add_all(maintainers)
        session.commit()
    except OverflowError:
        # Some records carry fake data (sizes in the terabytes); skip the
        # package instead of aborting the import.
        if session is not None:
            session.rollback()
    finally:
        # Close on every path; previously a failed commit left it open.
        if session is not None:
            session.close()
def get_latest_releases(limit=10) -> List[Release]:
    """Return the most recently created releases, newest first.

    Each release's ``package`` relationship is loaded in the same query
    (joinedload), so it can be read after the session is closed.

    Args:
        limit: maximum number of releases to return.
    """
    session = db_session.create_session()
    try:
        return (
            session.query(Release)
            .options(sqlalchemy.orm.joinedload(Release.package))
            .order_by(Release.created_date.desc())
            .limit(limit)
            .all()
        )
    finally:
        # Close even when the query raises.
        session.close()
def do_user_import(user_lookup: Dict[str, str]) -> Dict[str, User]:
    """Insert a User row per entry and return the stored users by email.

    Args:
        user_lookup: mapping of email address to display name.

    Returns:
        Every User row in the database keyed by its email, read back
        after the import. The objects are returned after their session is
        closed, like the other query helpers in this file.
    """
    print("Importing users ... ", flush=True)

    # One session for all inserts, always closed, instead of one
    # never-closed session per user.
    session = db_session.create_session()
    session.expire_on_commit = False
    try:
        with progressbar.ProgressBar(max_value=len(user_lookup)) as bar:
            for idx, (email, name) in enumerate(user_lookup.items()):
                user = User()
                user.email = email
                user.name = name
                session.add(user)
                session.commit()
                bar.update(idx)
    finally:
        session.close()

    print()
    sys.stderr.flush()
    sys.stdout.flush()

    session = db_session.create_session()
    try:
        # The query loads the mapped columns before the session goes away.
        return {u.email: u for u in session.query(User)}
    finally:
        session.close()
def get_package_by_id(package_id: str) -> Optional[Package]:
    """Look up a package by id, ignoring case and surrounding whitespace.

    The package's ``releases`` are loaded in the same query (joinedload),
    so they can be read after the session is closed.

    Args:
        package_id: the id to search for; it is stripped and lower-cased
            before the lookup.

    Returns:
        The matching Package, or None when ``package_id`` is empty or no
        row matches.
    """
    if not package_id:
        return None

    package_id = package_id.strip().lower()

    session = db_session.create_session()
    try:
        return (
            session.query(Package)
            .options(sqlalchemy.orm.joinedload(Package.releases))
            .filter(Package.id == package_id)
            .first()
        )
    finally:
        # Close even when the query raises.
        session.close()
def main():
    """Initialise the database, import data if there are no users, then summarise."""
    init_db()

    counting_session = db_session.create_session()
    existing_users = counting_session.query(User).count()
    counting_session.close()

    # Only an empty database (no users yet) triggers the import.
    if existing_users == 0:
        packages = do_load_files()
        imported_users = do_user_import(find_users(packages))
        do_import_packages(packages, imported_users)
        do_import_languages(packages)
        do_import_licenses(packages)

    do_summary()
def do_import_licenses(file_data: List[dict]):
    """Insert one License row per distinct detected license.

    ``detect_license`` is applied to each record's ``info['license']``;
    every non-empty result not seen before is stored once, with the raw
    license text of the first record that produced it as description.

    Args:
        file_data: package records (dicts). Records without an 'info'
            mapping are treated as having no license text.
    """
    imported = set()
    print("Importing licenses ... ", flush=True)

    # A single session for the whole import, always closed, instead of a
    # fresh session per license that was never released.
    session = db_session.create_session()
    try:
        with progressbar.ProgressBar(max_value=len(file_data)) as bar:
            for idx, p in enumerate(file_data):
                info = p.get('info') or {}
                raw_license = info.get('license')

                license_text = detect_license(raw_license)
                if license_text and license_text not in imported:
                    imported.add(license_text)

                    package_license = License()
                    package_license.id = license_text
                    package_license.description = raw_license
                    session.add(package_license)
                    # Commit per license so rows already stored survive a
                    # later failure, as before.
                    session.commit()

                bar.update(idx)
    finally:
        session.close()

    sys.stderr.flush()
    sys.stdout.flush()
def get_user_count() -> int:
    """Return the number of User rows."""
    session = db_session.create_session()
    try:
        return session.query(User).count()
    finally:
        # The session was previously never closed.
        session.close()
def get_release_count() -> int:
    """Return the number of Release rows."""
    session = db_session.create_session()
    try:
        return session.query(Release).count()
    finally:
        # The session was previously never closed.
        session.close()
def get_package_count() -> int:
    """Return the number of Package rows."""
    session = db_session.create_session()
    try:
        return session.query(Package).count()
    finally:
        # The session was previously never closed.
        session.close()