def set_unknown_domains_not_in_hyps(hyps):
    """Reconcile domain statuses for hypervisors that are no longer present.

    Domains currently in Started/Paused/Unknown whose ``hyp_started`` is not
    in ``hyps`` are flagged 'Unknown'; domains stuck in transient states
    (Starting/CreatingTemplate) on a missing hypervisor are set to 'Stopped'.

    Returns the listed result of the second update (keys of the ReQL update
    summary), matching the original behavior.
    """
    conn = new_rethink_connection()
    domains = r.table('domains')

    # Running-ish domains on a vanished hypervisor -> 'Unknown'.
    running_states = ['Started', 'Paused', 'Unknown']
    outcome = list(
        domains.filter(
            lambda dom: r.expr(running_states).contains(dom['status'])
        ).filter(
            lambda dom: r.not_(r.expr(hyps).contains(dom['hyp_started']))
        ).update({
            'status': 'Unknown'
        }).run(conn))

    # Transient domains on a vanished hypervisor -> 'Stopped'.
    transient_states = ['Starting', 'CreatingTemplate']
    outcome = list(
        domains.filter(
            lambda dom: r.expr(transient_states).contains(dom['status'])
        ).filter(
            lambda dom: r.not_(r.expr(hyps).contains(dom['hyp_started']))
        ).update({
            'status': 'Stopped'
        }).run(conn))

    close_rethink_connection(conn)
    return outcome
def get_issues_for_group(self, group_id, query_params):
    """Return the issues belonging to a group.

    When ``query_params`` is None, returns the raw issue rows for the group.
    Otherwise returns a projection (via ``apply_query_parameters``) joining
    each issue to its equipment name, or 'No Equip.' for issues that have no
    ``equipment_id`` field.

    :param group_id: primary key of the group whose issues are wanted
    :param query_params: paging/sorting parameters, or None for a raw list
    :return: list of issue docs, or a QueryResult-shaped projection
    """
    if query_params is None:
        issues = self.uow.run_list(
            self.table.get_all(group_id, index="group_id"))
        return issues

    # get the group name to prevent slow queries
    group = self.uow.run(self.uow.tables.groups.get(group_id))
    # BUGFIX: get() yields None for a missing group id; the previous
    # `'name' not in group` test raised TypeError on None instead of
    # returning an empty result.
    if group is None or 'name' not in group:
        return QueryResult([], 0)
    group_name = group['name']

    # manually map all columns to prevent name conflicts and overwriting
    return self.uow.apply_query_parameters(
        self.table.get_all(group_id, index='group_id')
            .filter(r.row.has_fields('equipment_id'))
            .eq_join('equipment_id', self.uow.tables.equipment)
            .map(lambda x: {'id': x['left']['id'],
                            'group': group_name,
                            'equipment': x['right']['name'],
                            'name': x['left']['name'],
                            'description': x['left']['description']})
            .union(
                self.table.get_all(group_id, index='group_id')
                .filter(r.not_(r.row.has_fields('equipment_id')))
                .map(lambda x: {'id': x['id'],
                                'group': group_name,
                                'equipment': 'No Equip.',
                                'name': x['name'],
                                'description': x['description']})),
        query_params)
def get_hyps_with_status(list_status, not_=False, empty=False):
    """Return enabled hypervisor rows selected by status.

    :param list_status: statuses to match
    :param not_: if True, select hypervisors whose status is NOT in
        ``list_status`` instead
    :param empty: if True, also append enabled hypervisors that have no
        'status' field at all
    :return: list of hypervisor documents

    Fixes: replaced the ``== True`` comparisons with plain truthiness
    (callers pass booleans) and stopped shadowing the builtin ``list``-like
    name ``l``.
    """
    r_conn = new_rethink_connection()
    rtable = r.table('hypervisors')
    # Query chains are immutable, so the enabled-only prefix can be shared.
    enabled = rtable.filter({'enabled': True})

    if not_:
        hyps = list(
            enabled.filter(
                lambda d: r.not_(r.expr(list_status).contains(d['status']))
            ).run(r_conn))
    else:
        hyps = list(
            enabled.filter(
                lambda d: r.expr(list_status).contains(d['status'])
            ).run(r_conn))

    if empty:
        # Rows that never got a 'status' field are invisible to both
        # branches above; tack them on when requested.
        hyps += list(
            enabled.filter(lambda n: ~n.has_fields('status')).run(r_conn))

    close_rethink_connection(r_conn)
    return hyps
async def create_subscription(user_id, serial, voice):
    """Subscribe a user to a serial, skipping duplicates.

    Builds a subscription entry from ``serial`` and appends it to the user's
    "serials" array only when an identical entry is not already present.

    Note: ``voice`` is accepted but not used in this query (kept for the
    caller-facing interface).
    """
    subscription = {
        "id": serial["id"],
        "excluded_voices": [],
        "title": serial["title"]
    }
    # Guard pieces built separately for readability; ReQL terms are lazy,
    # so this is the same single query as before.
    is_target_user = r.row["id"] == user_id
    lacks_subscription = r.not_(
        r.row["serials"].default([]).contains(subscription))
    await User.manager.execute(
        User.manager.table
        .filter(r.and_(is_target_user, lacks_subscription))
        .update({
            "serials": r.row["serials"].default([]).append(subscription)
        }))
def get_hyps_with_status(list_status, not_=False, empty=False):
    """Return enabled hypervisor documents filtered by status.

    :param list_status: statuses to match
    :param not_: if True, invert the match (status NOT in ``list_status``)
    :param empty: if True, also include enabled rows lacking a 'status' field
    :return: list of hypervisor documents
    """
    conn = new_rethink_connection()
    table = r.table('hypervisors')
    # ReQL chains are immutable, so the shared enabled-only prefix is safe.
    enabled_q = table.filter({'enabled': True})

    if not_ == True:
        found = list(
            enabled_q.filter(
                lambda row: r.not_(
                    r.expr(list_status).contains(row['status']))
            ).run(conn))
    else:
        found = list(
            enabled_q.filter(
                lambda row: r.expr(list_status).contains(row['status'])
            ).run(conn))

    if empty == True:
        # Rows without a 'status' field match neither branch above.
        found = found + list(
            enabled_q.filter(lambda row: ~row.has_fields('status')).run(conn))

    close_rethink_connection(conn)
    return found
def scrape_plugin_repos(num):
    """Scrapes the num plugin repos that have been least recently scraped.

    Side effects only (DB writes + console output); returns nothing.
    Python 2 code (print statements).
    """
    MIN_FORK_USERS = 3

    query = r.table('plugin_github_repos').filter({'is_blacklisted': False})

    # We don't want to scrape forks that not many people use.
    query = query.filter(
        r.not_((r.row['is_fork'] == True) & (  # NOQA
            r.row['plugin_manager_users'] < MIN_FORK_USERS)),
        default=True)

    # Only scrape repos that don't redirect to other ones (probably renamed).
    query = query.filter(r.row['redirects_to'] == '')

    # We scrape vim-scripts separately using the batch /users/:user/repos call
    query = query.filter(r.row['owner'] != 'vim-scripts')

    # Oldest-scraped first, capped at `num`.
    query = query.order_by('last_scraped_at').limit(num)
    repos = query.run(r_conn())

    # TODO(david): Print stats at the end: # successfully scraped,
    # # not found, # redirects, etc.
    for repo in repos:
        repo_name = repo['repo_name']
        repo_owner = repo['owner']

        # Print w/o newline.
        print " scraping %s/%s ..." % (repo_owner, repo_name),
        sys.stdout.flush()

        # Attempt to fetch data about the plugin.
        res, repo_data = get_api_page('repos/%s/%s' % (repo_owner, repo_name))

        # If the API call 404s, then see if the repo has been renamed by
        # checking for a redirect in a non-API call.
        if res.status_code == 404:
            res = requests.head('https://github.com/%s/%s' % (repo_owner,
                                                              repo_name))
            if res.status_code == 301:
                location = res.headers.get('location')
                # NOTE(review): assumes the Location header is a well-formed
                # github.com/<owner>/<repo> URL — no validation here.
                _, redirect_owner, redirect_repo_name = location.rsplit(
                    '/', 2)
                repo['redirects_to'] = '%s/%s' % (redirect_owner,
                                                  redirect_repo_name)

                # Make sure we insert the new location of the repo, which will
                # be scraped in a future run.
                PluginGithubRepos.upsert_with_owner_repo({
                    'owner': redirect_owner,
                    'repo_name': redirect_repo_name,
                    # TODO(david): Should append to a list
                    'redirects_from': ('%s/%s' % (repo_owner, repo_name)),
                })

                # And now change the GitHub repo location of the plugin that
                # the old repo location pointed to
                query = r.table('plugins').get_all(
                    [repo_owner, repo_name], index='github_owner_repo')
                db_plugin = db.util.get_first(query)
                if db_plugin:
                    db_plugin['github_owner'] = redirect_owner
                    db_plugin['github_repo_name'] = redirect_repo_name
                    db.plugins.insert(db_plugin, conflict='replace')

                print 'redirects to %s/%s.' % (redirect_owner,
                                               redirect_repo_name)
            else:
                # TODO(david): Insert some metadata in the github repo that
                # this is not found
                print 'not found.'

            plugin_data = None
        else:
            plugin_data = get_plugin_data(repo_owner, repo_name, repo_data)

        # NOTE(review): on the 404 path repo_data is whatever get_api_page
        # returned; if that can be None, the .get() below would raise —
        # confirm get_api_page yields a dict even on failure.
        repo['repo_data'] = repo_data
        # NOTE(review): falls back to the pre-existing repo['repo_id'];
        # KeyError if that key was never set — verify upstream schema.
        repo['repo_id'] = str(repo_data.get('id', repo['repo_id']))
        PluginGithubRepos.log_scrape(repo)

        # If this is a fork, note it and ensure we know about original repo.
        if repo_data.get('fork'):
            repo['is_fork'] = True
            PluginGithubRepos.upsert_with_owner_repo({
                'owner': repo_data['parent']['owner']['login'],
                'repo_name': repo_data['parent']['name'],
            })

        PluginGithubRepos.upsert_with_owner_repo(repo)

        # For most cases we don't care about forked repos, unless the forked
        # repo is used by others.
        if repo_data.get('fork') and (repo.get('plugin_manager_users', 0) <
                                      MIN_FORK_USERS):
            print 'skipping fork of %s' % repo_data['parent']['full_name']
            continue

        if plugin_data:
            # Insert the number of plugin manager users across all
            # names/owners of this repo.
            # TODO(david): Try to also use repo_id for this (but not all
            # repos have it), or look at multiple levels of redirects.
            plugin_manager_users = repo.get('plugin_manager_users', 0)
            other_repos = r.table('plugin_github_repos').get_all(
                '%s/%s' % (repo_owner, repo_name),
                index='redirects_to').run(r_conn())
            for other_repo in other_repos:
                if other_repo['id'] == repo['id']:
                    continue
                plugin_manager_users += other_repo.get(
                    'plugin_manager_users', 0)
            plugin_data['github_bundles'] = plugin_manager_users

            if repo.get('from_submission'):
                _add_submission_data(plugin_data, repo['from_submission'])

            db.plugins.add_scraped_data(plugin_data, repo,
                                        submission=repo.get(
                                            'from_submission'))

        print 'done.'
def scrape_plugin_repos(num):
    """Scrapes the num plugin repos that have been least recently scraped.

    Side effects only (DB writes + console output); returns nothing.
    Python 2 code (print statements).
    """
    MIN_FORK_USERS = 3

    query = r.table('plugin_github_repos').filter({'is_blacklisted': False})

    # We don't want to scrape forks that not many people use.
    query = query.filter(r.not_((r.row['is_fork'] == True) & (
        r.row['plugin_manager_users'] < MIN_FORK_USERS)), default=True)

    # Only scrape repos that don't redirect to other ones (probably renamed).
    query = query.filter(r.row['redirects_to'] == '')

    # We scrape vim-scripts separately using the batch /users/:user/repos call
    query = query.filter(r.row['owner'] != 'vim-scripts')

    # Oldest-scraped first, capped at `num`.
    query = query.order_by('last_scraped_at').limit(num)
    repos = query.run(r_conn())

    # TODO(david): Print stats at the end: # successfully scraped,
    # # not found, # redirects, etc.
    for repo in repos:
        repo_name = repo['repo_name']
        repo_owner = repo['owner']

        # Print w/o newline.
        print " scraping %s/%s ..." % (repo_owner, repo_name),
        sys.stdout.flush()

        # Attempt to fetch data about the plugin.
        res, repo_data = get_api_page('repos/%s/%s' % (repo_owner, repo_name))

        # If the API call 404s, then see if the repo has been renamed by
        # checking for a redirect in a non-API call.
        if res.status_code == 404:
            res = requests.head('https://github.com/%s/%s' % (
                repo_owner, repo_name))
            if res.status_code == 301:
                location = res.headers.get('location')
                # NOTE(review): assumes the Location header is a well-formed
                # github.com/<owner>/<repo> URL — no validation here.
                _, redirect_owner, redirect_repo_name = location.rsplit(
                    '/', 2)
                repo['redirects_to'] = '%s/%s' % (redirect_owner,
                                                  redirect_repo_name)

                # Make sure we insert the new location of the repo, which will
                # be scraped in a future run.
                PluginGithubRepos.upsert_with_owner_repo({
                    'owner': redirect_owner,
                    'repo_name': redirect_repo_name,
                    # TODO(david): Should append to a list
                    'redirects_from': ('%s/%s' % (repo_owner, repo_name)),
                })

                # And now change the GitHub repo location of the plugin that
                # the old repo location pointed to
                query = r.table('plugins').get_all(
                    [repo_owner, repo_name], index='github_owner_repo')
                db_plugin = db.util.get_first(query)
                if db_plugin:
                    db_plugin['github_owner'] = redirect_owner
                    db_plugin['github_repo_name'] = redirect_repo_name
                    db.plugins.insert(db_plugin, conflict='replace')

                print 'redirects to %s/%s.' % (redirect_owner,
                                               redirect_repo_name)
            else:
                # TODO(david): Insert some metadata in the github repo that
                # this is not found
                print 'not found.'

            plugin_data = None
        else:
            plugin_data = get_plugin_data(repo_owner, repo_name, repo_data)

        # NOTE(review): on the 404 path repo_data is whatever get_api_page
        # returned; if that can be None, the .get() below would raise —
        # confirm get_api_page yields a dict even on failure.
        repo['repo_data'] = repo_data
        # NOTE(review): falls back to the pre-existing repo['repo_id'];
        # KeyError if that key was never set — verify upstream schema.
        repo['repo_id'] = str(repo_data.get('id', repo['repo_id']))
        PluginGithubRepos.log_scrape(repo)

        # If this is a fork, note it and ensure we know about original repo.
        if repo_data.get('fork'):
            repo['is_fork'] = True
            PluginGithubRepos.upsert_with_owner_repo({
                'owner': repo_data['parent']['owner']['login'],
                'repo_name': repo_data['parent']['name'],
            })

        # This variant writes the row back directly rather than going
        # through the upsert helper.
        r.table('plugin_github_repos').insert(repo,
                                              conflict='replace').run(
            r_conn())

        # For most cases we don't care about forked repos, unless the forked
        # repo is used by others.
        if repo_data.get('fork') and (
                repo.get('plugin_manager_users', 0) < MIN_FORK_USERS):
            print 'skipping fork of %s' % repo_data['parent']['full_name']
            continue

        if plugin_data:
            # Insert the number of plugin manager users across all
            # names/owners of this repo.
            # TODO(david): Try to also use repo_id for this (but not all
            # repos have it), or look at multiple levels of redirects.
            plugin_manager_users = repo.get('plugin_manager_users', 0)
            other_repos = r.table('plugin_github_repos').get_all(
                '%s/%s' % (repo_owner, repo_name),
                index='redirects_to').run(r_conn())
            for other_repo in other_repos:
                if other_repo['id'] == repo['id']:
                    continue
                plugin_manager_users += other_repo.get(
                    'plugin_manager_users', 0)
            plugin_data['github_bundles'] = plugin_manager_users

            if repo.get('from_submission'):
                _add_submission_data(plugin_data, repo['from_submission'])

            db.plugins.add_scraped_data(plugin_data, repo,
                                        submission=repo.get(
                                            'from_submission'))

        print 'done.'
def scrape_plugin_repos(num):
    """Scrapes the num plugin repos that have been least recently scraped.

    Side effects only (DB writes + console output); returns nothing.
    Python 2 code (print statements).
    """
    MIN_FORK_USERS = 3

    query = r.table("plugin_github_repos").filter({"is_blacklisted": False})

    # We don't want to scrape forks that not many people use.
    query = query.filter(
        r.not_((r.row["is_fork"] == True) &
               (r.row["plugin_manager_users"] < MIN_FORK_USERS)),
        default=True  # NOQA
    )

    # Only scrape repos that don't redirect to other ones (probably renamed).
    query = query.filter(r.row["redirects_to"] == "")

    # We scrape vim-scripts separately using the batch /users/:user/repos call
    query = query.filter(r.row["owner"] != "vim-scripts")

    # Oldest-scraped first, capped at `num`.
    query = query.order_by("last_scraped_at").limit(num)
    repos = query.run(r_conn())

    # TODO(david): Print stats at the end: # successfully scraped,
    # # not found, # redirects, etc.
    for repo in repos:
        repo_name = repo["repo_name"]
        repo_owner = repo["owner"]

        # Print w/o newline.
        print " scraping %s/%s ..." % (repo_owner, repo_name),
        sys.stdout.flush()

        # Attempt to fetch data about the plugin.
        res, repo_data = get_api_page("repos/%s/%s" % (repo_owner, repo_name))

        # If the API call 404s, then see if the repo has been renamed by
        # checking for a redirect in a non-API call.
        if res.status_code == 404:
            res = requests.head("https://github.com/%s/%s" % (repo_owner,
                                                              repo_name))
            if res.status_code == 301:
                location = res.headers.get("location")
                # Unlike earlier versions, this variant validates the
                # redirect target before trusting it.
                valid_repo_url = re.compile("^https://github.com/[^/]+/[^/]+")
                if not valid_repo_url.match(location):
                    print "redirects to invalid GitHub repo URL: %s" % \
                        location
                    continue
                _, redirect_owner, redirect_repo_name = location.rsplit(
                    "/", 2)
                repo["redirects_to"] = "%s/%s" % (redirect_owner,
                                                  redirect_repo_name)

                # Make sure we insert the new location of the repo, which will
                # be scraped in a future run.
                PluginGithubRepos.upsert_with_owner_repo(
                    {
                        "owner": redirect_owner,
                        "repo_name": redirect_repo_name,
                        # TODO(david): Should append to a list
                        "redirects_from": ("%s/%s" % (repo_owner, repo_name)),
                    }
                )

                # And now change the GitHub repo location of the plugin that
                # the old repo location pointed to
                query = r.table("plugins").get_all(
                    [repo_owner, repo_name], index="github_owner_repo")
                db_plugin = db.util.get_first(query)
                if db_plugin:
                    db_plugin["github_owner"] = redirect_owner
                    db_plugin["github_repo_name"] = redirect_repo_name
                    db.plugins.insert(db_plugin, conflict="replace")

                print "redirects to %s/%s." % (redirect_owner,
                                               redirect_repo_name)
            else:
                # TODO(david): Insert some metadata in the github repo that
                # this is not found
                print "not found."

            plugin_data = None
        else:
            plugin_data = get_plugin_data(repo_owner, repo_name, repo_data)

        # NOTE(review): on the 404 path repo_data is whatever get_api_page
        # returned; if that can be None, the .get() below would raise —
        # confirm get_api_page yields a dict even on failure.
        repo["repo_data"] = repo_data
        # NOTE(review): falls back to the pre-existing repo["repo_id"];
        # KeyError if that key was never set — verify upstream schema.
        repo["repo_id"] = str(repo_data.get("id", repo["repo_id"]))
        PluginGithubRepos.log_scrape(repo)

        # If this is a fork, note it and ensure we know about original repo.
        if repo_data.get("fork"):
            repo["is_fork"] = True
            PluginGithubRepos.upsert_with_owner_repo(
                {"owner": repo_data["parent"]["owner"]["login"],
                 "repo_name": repo_data["parent"]["name"]}
            )

        PluginGithubRepos.upsert_with_owner_repo(repo)

        # For most cases we don't care about forked repos, unless the forked
        # repo is used by others.
        if repo_data.get("fork") and (repo.get("plugin_manager_users", 0) <
                                      MIN_FORK_USERS):
            print "skipping fork of %s" % repo_data["parent"]["full_name"]
            continue

        if plugin_data:
            # Insert the number of plugin manager users across all
            # names/owners of this repo.
            # TODO(david): Try to also use repo_id for this (but not all
            # repos have it), or look at multiple levels of redirects.
            plugin_manager_users = repo.get("plugin_manager_users", 0)
            other_repos = (
                r.table("plugin_github_repos")
                .get_all("%s/%s" % (repo_owner, repo_name),
                         index="redirects_to")
                .run(r_conn())
            )
            for other_repo in other_repos:
                if other_repo["id"] == repo["id"]:
                    continue
                plugin_manager_users += other_repo.get(
                    "plugin_manager_users", 0)
            plugin_data["github_bundles"] = plugin_manager_users

            if repo.get("from_submission"):
                _add_submission_data(plugin_data, repo["from_submission"])

            db.plugins.add_scraped_data(plugin_data, repo,
                                        submission=repo.get(
                                            "from_submission"))

        print "done."