print len(users), "duplicated users found with", len( duplicated), "total ci unique" total_users = users.values() total_names = set(x[0] for x in total_users) delete = set(total_names) # Exclude any user who has ever submitted a journal from deletion cursor.execute("SELECT DISTINCT ON (submitted_by) submitted_by FROM journals") journaled = set(x[0] for x in cursor.fetchall()) delete -= journaled # Exclude any user who is assigned a role on a package cursor.execute("SELECT DISTINCT ON (user_name) user_name FROM roles") roles = set(x[0] for x in cursor.fetchall()) delete -= roles # Exclude any user who has logged in cursor.execute( "SELECT DISTINCT ON (name) name FROM users WHERE last_login != NULL") logged_in = set(x[0] for x in cursor.fetchall()) delete -= logged_in if delete: cursor.execute("DELETE FROM users WHERE name in %s", (tuple(delete), )) store.commit() store.close()
# For each desired hosting mode, the list of current modes that already
# satisfy it. A package whose current mode is listed is left untouched.
accepted_modes = {
    "pypi-scrape-crawl": ["pypi-scrape-crawl", "pypi-scrape", "pypi-explicit"],
    "pypi-scrape": ["pypi-scrape", "pypi-explicit"],
    "pypi-explicit": ["pypi-explicit"],
}

store.open()

for desired_mode, names in data.iteritems():
    # We don't need to do any processing for pypi-scrape-crawl
    if desired_mode == "pypi-scrape-crawl":
        continue

    for name in names:
        matches = store.find_package(name)
        if not matches:
            # This doesn't exist
            continue

        # The name returned by the store must be the same project once
        # both names are normalised; then continue with the stored name.
        assert safe_name(name).lower() == safe_name(matches[0]).lower()
        name = matches[0]

        # Already in an acceptable mode: nothing to change.
        if store.get_package_hosting_mode(name) in accepted_modes[desired_mode]:
            continue

        store.set_package_hosting_mode(name, desired_mode)
        # NOTE(review): the source had lost its indentation; recording the
        # name only when the mode was actually changed is the assumed
        # nesting -- confirm against the original script.
        processed[desired_mode].add(name)

store.commit()

# Persist the record of processed packages.
with open("migrated.pkl", "wb") as out:
    pickle.dump(processed, out)