def get_globalusage(self, site, image, shared = False):
    """Yield every usage of the file ``image`` that the globalusage API reports.

    Calls ``self.connect_http()`` and then repeatedly queries
    ``prop=globalusage|imageinfo`` for ``File:<image>`` through
    ``self.http.query_api``, following ``query-continue`` until no further
    ``globalusage`` continuation is returned.

    Parameters:
        site: either a hostname string (the API path is then
            ``/w/api.php``) or an object providing ``hostname()`` and
            ``apipath()``.
        image: file title without the ``File:`` prefix.
        shared: when true, stop without yielding anything if the API
            reports ``imagerepository == 'local'`` for the file.

    Yields:
        ``(family(usage['wiki']), (namespace, title_without_ns, title))``
        where spaces in the title are replaced by underscores.  For
        namespace 0 the last two elements are the same string.

    The generator ends silently when the response is empty or carries no
    ``query``/``pages`` data.
    """
    self.connect_http()

    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of str.
    if isinstance(site, str):
        hostname = site
        apipath = '/w/api.php'
    else:
        hostname = site.hostname()
        apipath = site.apipath()

    kwargs = {
        'action': 'query',
        'titles': u'File:' + image,
        'prop': 'globalusage|imageinfo',
        'iiprop': '',
        'guprop': 'namespace',
        'gulimit': '500',
    }

    while True:
        res = self.http.query_api(hostname, apipath, **kwargs)
        if not res:
            return
        # .get() so that a response lacking 'query' or 'pages' ends the
        # generator like any other empty answer instead of raising
        # KeyError.
        query = res.get('query')
        if not query:
            return
        pages = query.get('pages')
        if not pages:
            return

        # Only one title is requested, so only the first page entry is
        # looked at.  next(iter(...)) works on lists and on dict views
        # alike, whereas subscripting .values() fails on Python 3.
        page = next(iter(pages.values()))
        if shared and page.get('imagerepository') == 'local':
            return

        for usage in page.get('globalusage', ()):
            title = usage['title'].replace(' ', '_')
            namespace = int(usage['ns'])
            # Separate name so the ``site`` parameter is not rebound.
            usage_site = family(usage['wiki'])
            if namespace != 0:
                yield usage_site, (namespace, strip_ns(title), title)
            else:
                yield usage_site, (namespace, title, title)

        continuation = res.get('query-continue', ())
        if 'globalusage' in continuation:
            # Merge the continuation parameters into the next request.
            kwargs.update(continuation['globalusage'])
        else:
            return
def __init__(self, limit = 100, mysql_default_server = 2,
        mysql_host_prefix = 'sql-s', mysql_kwargs = {}, no_db = False,
        use_autoconn = False, http_retry_timeout = 30,
        http_max_retries = -1, http_callback = lambda *args: None,
        mysql_retry_timeout = 60, mysql_max_retries = -1,
        mysql_callback = lambda *args: None):
    """Store the HTTP/MySQL settings and locate the wiki databases.

    Parameters:
        limit: maximum number of rows read from ``toolserver.wiki``
            (rows are ordered by ``size`` descending).
        mysql_default_server: id of the server queried for
            ``toolserver.wiki``; its host name is
            ``mysql_host_prefix + str(mysql_default_server)``.
        mysql_host_prefix: prefix prepended to a server id to form a
            host name.
        mysql_kwargs: mapping stored on ``self.mysql_kwargs`` with any
            ``'host'`` entry removed.  The argument itself is never
            modified.
        no_db: when true, only the HTTP settings are stored; no database
            attribute is created and no connection is opened.
        use_autoconn, http_retry_timeout, http_max_retries,
        http_callback, mysql_retry_timeout, mysql_max_retries,
        mysql_callback: stored unchanged on the instance.
    """
    self.http = None
    self.http_retry_timeout = http_retry_timeout
    self.http_max_retries = http_max_retries
    self.http_callback = http_callback

    if no_db:
        return

    self.mysql_host_prefix = mysql_host_prefix
    # Work on a private copy: deleting 'host' from the argument itself
    # would alter the caller's dict and, when the argument is omitted,
    # the default dict shared by every call.
    self.mysql_kwargs = mysql_kwargs.copy()
    self.mysql_kwargs.pop('host', None)
    self.use_autoconn = use_autoconn
    self.mysql_retry_timeout = mysql_retry_timeout
    self.mysql_max_retries = mysql_max_retries
    self.mysql_callback = mysql_callback

    self.connections = []

    # Mapping database name -> mysql connection
    self.databases = {}
    # Mapping server id -> mysql connection
    # (the value is whatever connect_mysql returns; unpacked below as a
    # (database, cursor) pair)
    self.servers = {}
    # Mapping database name -> (lang, family)
    self.sites = {}

    self.unknown_families = []
    # Mapping family name -> family object
    self.known_families = {}

    database, cursor = self.connect_mysql(mysql_host_prefix + str(mysql_default_server))
    self.servers[mysql_default_server] = (database, cursor)

    # Find where the databases are located
    cursor.execute('SELECT dbname, domain, server FROM toolserver.wiki ORDER BY size DESC LIMIT %s', (limit, ))
    for dbname, domain, server in cursor.fetchall():
        # Connect to each server only once.
        if server not in self.servers:
            self.servers[server] = self.connect_mysql(mysql_host_prefix + str(server))

        # FIXME: wikimediafoundation!
        try:
            lang, fam = family(domain)
            if fam not in self.known_families:
                self.known_families[fam] = wikipedia.Family(fam, fatal = False)
        except (RuntimeError, ValueError):
            # Domains that cannot be resolved are only recorded.
            self.unknown_families.append(domain)
        else:
            self.sites[dbname] = (lang, fam)
            self.databases[dbname] = self.servers[server]
def startElement(self, name, attr):
    """Handle an opening ``Family`` or ``Product`` element.

    A ``Family`` element replaces ``self.family`` with a new ``family``
    object named after the element's ``Name`` attribute.  A ``Product``
    element appends a new ``product`` to ``self.family.products``, named
    after ``Name`` and with ``displayMode`` taken from the
    ``DisplayMode`` attribute, or ``"Visible"`` when that attribute is
    absent.  Any other element name is ignored.
    """
    if name == "Family":
        self.family = family()
        self.family.name = attr.getValue("Name")
        return

    if name != "Product":
        return

    present = attr.getNames()
    entry = product()
    entry.name = attr.getValue("Name")
    # Fall back to "Visible" when the element carries no DisplayMode.
    entry.displayMode = (
        attr.getValue("DisplayMode") if "DisplayMode" in present else "Visible"
    )
    self.family.products.append(entry)
def return_family_list(n=1000, m=5, seed=0):
    """Create one ``family`` per node of a random graph and relate neighbours.

    The graph comes from ``nx.random_graphs.barabasi_albert_graph(n, m,
    seed)``.  Every node gets a ``family(label=node)``; for each edge
    ``(a, b)`` the family at index ``a`` calls ``relate`` on the family
    at index ``b``.

    Returns a ``(families, graph)`` tuple.
    """
    graph = nx.random_graphs.barabasi_albert_graph(n, m, seed)
    edges = list(graph.edges())

    # One family per node, in the graph's iteration order.
    families = [family(label=node) for node in graph]

    # Edge endpoints are used directly as list indices.
    # NOTE(review): this presumes node labels are 0..n-1 in iteration
    # order - confirm against the graph generator.
    for left, right in edges:
        families[left].relate(families[right])

    return families, graph
def __init__(
    self,
    limit = 100,
    mysql_default_server = 3,
    mysql_host_prefix = 'sql-s',
    mysql_host_suffix = '',
    mysql_kwargs = {},
    no_db = False,
    use_autoconn = False,
    http_retry_timeout = 30,
    http_max_retries = -1,
    http_callback = lambda *args: None,
    mysql_retry_timeout = 60,
    mysql_max_retries = -1,
    mysql_callback = lambda *args: None,
):
    """Record connection settings and map each wiki database to its server.

    The HTTP settings are always stored.  Unless ``no_db`` is true, the
    constructor then connects to the default server, reads up to
    ``limit`` rows of ``(dbname, domain, server)`` from
    ``toolserver.wiki`` (largest ``size`` first), connects once to every
    server mentioned, and fills ``self.sites``, ``self.databases`` and
    ``self.domains`` for each domain that ``family()`` can resolve.
    Domains that raise ``RuntimeError``, ``ValueError`` or
    ``SyntaxError`` are collected in ``self.unknown_families``.

    ``mysql_kwargs`` is copied and stored without any ``'host'`` entry;
    the argument itself is left untouched.
    """
    # HTTP side: nothing is opened here, the values are only kept.
    self.http = None
    self.http_retry_timeout = http_retry_timeout
    self.http_max_retries = http_max_retries
    self.http_callback = http_callback

    if no_db:
        return

    # MySQL side.
    self.mysql_host_prefix = mysql_host_prefix
    self.mysql_kwargs = mysql_kwargs.copy()  # To be safe
    self.mysql_kwargs.pop('host', None)
    self.use_autoconn = use_autoconn
    self.mysql_retry_timeout = mysql_retry_timeout
    self.mysql_max_retries = mysql_max_retries
    self.mysql_callback = mysql_callback

    self.connections = []
    self.databases = {}         # database name -> mysql connection
    self.servers = {}           # server id -> mysql connection
    self.sites = {}             # database name -> (lang, family)
    self.domains = {}           # database name -> domain
    self.unknown_families = []
    self.known_families = {}    # family name -> family object

    # NOTE(review): the default server's host name is built without
    # mysql_host_suffix, unlike the other servers below - confirm that
    # this is intended.
    default_host = mysql_host_prefix + str(mysql_default_server)
    handle, cur = self.connect_mysql(default_host)
    self.servers[mysql_default_server] = (handle, cur)

    # Find where the databases are located
    cur.execute('SELECT dbname, domain, server FROM toolserver.wiki ORDER BY size DESC LIMIT %s', (limit, ))
    for row in cur.fetchall():
        dbname, domain, server = row

        if server not in self.servers:
            server_host = mysql_host_prefix + str(server) + mysql_host_suffix
            self.servers[server] = self.connect_mysql(server_host)

        # FIXME: wikimediafoundation!
        # TODO: This is one big mess
        try:
            lang, fam = family(domain)
            if fam not in self.known_families:
                self.known_families[fam] = wikipedia.Family(fam, fatal = False)
        except (RuntimeError, ValueError, SyntaxError):
            self.unknown_families.append(domain)
            continue

        # Reached only when the domain was resolved.
        self.sites[dbname] = (lang, fam)
        self.databases[dbname] = self.servers[server]
        self.domains[dbname] = domain