def find_named_ecosystems(s, cutoff=5, g=None):
    """Return named package ecosystems and their sizes.

    An ecosystem is a name prefix shared by several packages, i.e. the part
    of a package name before its first ``.`` or ``-`` separator. Candidate
    prefixes come from packages whose in-degree in the package graph is at
    least *cutoff*.

    Args:
        s: Database session used to query ``db.Package``.
        cutoff: Minimum in-degree a package needs to contribute a prefix.
        g: Optional pre-built directed graph whose nodes are package ids.
            When omitted, it is built from ``get_pkg_edgelist(s)``.

    Returns:
        A dict with the keys ``'dot-ecosystems'`` and ``'dash-ecosystems'``.
        Each value is a list of ``[prefix, count]`` pairs sorted by count in
        descending order, keeping only prefixes that match more than one
        package name.
    """
    # Explicit None check: an empty graph is falsy but is still valid input.
    if g is None:
        g = nx.DiGraph(get_pkg_edgelist(s))

    # Raw string avoids the invalid "\w" escape; compiled once for the loop.
    separator_pattern = re.compile(r'\w+([.-])')

    search_names = []
    seen_prefixes = set()  # O(1) duplicate check; the list keeps the order.
    # dict() accepts both the plain dict returned by older NetworkX releases
    # and the degree view returned by newer ones.
    for pkg_id, indegree in dict(g.in_degree()).items():
        # Only packages required at least `cutoff` times are worth searching.
        if indegree < cutoff:
            continue
        row = s.query(db.Package.name).filter(db.Package.id == pkg_id).first()
        if row is None:
            # The graph node has no matching Package row; nothing to inspect.
            continue
        pkg_name = row[0]
        split_search = separator_pattern.search(pkg_name)
        if not split_search:
            continue
        prefix = pkg_name.split(split_search.group(1))[0]
        if prefix not in seen_prefixes:
            seen_prefixes.add(prefix)
            search_names.append(prefix)

    def name_searcher(sep_char, search_names):
        """Count package names starting with each prefix plus *sep_char*."""
        # NOTE(review): startswith() is presumably rendered as SQL LIKE, where
        # an "_" inside a prefix acts as a wildcard - confirm whether the
        # prefix needs escaping.
        counts = [
            [
                name,
                s.query(db.Package.name).filter(
                    db.Package.name.startswith(name + sep_char)).count(),
            ]
            for name in search_names
        ]
        counts.sort(key=lambda pair: pair[1], reverse=True)
        # A prefix matching a single package is not an ecosystem.
        return [pair for pair in counts if pair[1] > 1]

    # dot/dash search
    return {
        'dot-ecosystems': name_searcher('.', search_names),
        'dash-ecosystems': name_searcher('-', search_names),
    }
def packages_with_selfloops(s, g=None):
    """Return the name rows of packages which require themselves.

    Args:
        s: Database session used to query ``db.Package``.
        g: Optional pre-built graph whose nodes are package ids. When
            omitted, it is built with ``create_graph`` from
            ``get_pkg_nodelist(s)`` and ``get_pkg_edgelist(s)``.

    Returns:
        A list with one entry per self-looping node: the result of
        ``.first()`` on a ``db.Package.name`` query (a one-column row, or
        ``None`` when no package has that id) - not full Package objects.
    """
    # Explicit None check: an empty graph is falsy but is still valid input.
    if g is None:
        g = create_graph(get_pkg_nodelist(s), get_pkg_edgelist(s))

    # NOTE(review): nodes_with_selfloops() as a graph method presumably ties
    # this to an older NetworkX release - confirm before upgrading.
    return [
        s.query(db.Package.name).filter(db.Package.id == pkg_id).first()
        for pkg_id in g.nodes_with_selfloops()
    ]
def top_required_packages(s, top=None, g=None):
    """Return the most-required packages and how often each is required.

    Args:
        s: Database session used to query ``db.Package``.
        top: Number of leading entries to return; ``None`` returns all.
        g: Optional pre-built directed graph whose nodes are package ids.
            When omitted, it is built from ``get_pkg_edgelist(s)``.

    Returns:
        A list of ``[package, in_degree]`` pairs sorted by in-degree in
        descending order. ``package`` is the result of ``.first()`` on a
        ``db.Package`` query, so it is ``None`` when no row has that id.
    """
    # Explicit None check: an empty graph is falsy but is still valid input.
    if g is None:
        g = nx.DiGraph(get_pkg_edgelist(s))

    # dict() accepts both the plain dict returned by older NetworkX releases
    # and the degree view returned by newer ones.
    indegs = sorted(
        dict(g.in_degree()).items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [
        [s.query(db.Package).filter(db.Package.id == pkg_id).first(), count]
        for pkg_id, count in indegs[:top]
    ]
def package_out_degree_distribution_chart(s, filename, g=None):
    """Create an out-degree distribution chart and save it to *filename*.

    Args:
        s: Database session, used only to build the graph when *g* is
            omitted.
        filename: Destination passed to ``plt.savefig``.
        g: Optional pre-built graph. When omitted, it is built with
            ``create_graph`` from ``get_pkg_nodelist(s)`` and
            ``get_pkg_edgelist(s)``.
    """
    # Explicit None check: an empty graph is falsy but is still valid input.
    if g is None:
        g = create_graph(get_pkg_nodelist(s), get_pkg_edgelist(s))

    # dict() accepts both the plain dict returned by older NetworkX releases
    # and the degree view returned by newer ones.
    deg_seq = sorted(dict(g.out_degree()).values(), reverse=True)

    # One bin per out-degree from 0 to 18; larger out-degrees fall outside
    # the bin range and are not drawn.
    # NOTE(review): `normed` was replaced by `density` in later matplotlib
    # releases - confirm the supported matplotlib version before changing it.
    plt.hist(deg_seq, bins=range(0, 20, 1), normed=True)
    plt.xticks(range(0, 20, 1))
    plt.title('Requirement graph outdegree distribution chart')
    plt.xlabel('Outdegree')
    plt.ylabel('Frequency')
    plt.savefig(filename)
    plt.close()
def strong_weak_package_connections(s, g=None):
    """Return package names grouped by connected component.

    Only components with more than one node are reported.

    Args:
        s: Database session used to query ``db.Package``.
        g: Optional pre-built directed graph whose nodes are package ids.
            When omitted, it is built with ``create_graph`` from
            ``get_pkg_nodelist(s)`` and ``get_pkg_edgelist(s)``.

    Returns:
        A dict with the keys ``'strong'`` and ``'weak'``. Each value is a
        list of components, and each component is a list holding the result
        of ``.first()`` on a ``db.Package.name`` query for every node (a
        one-column row, or ``None`` when no package has that id).
    """
    # Explicit None check: an empty graph is falsy but is still valid input.
    if g is None:
        g = create_graph(get_pkg_nodelist(s), get_pkg_edgelist(s))

    def component_names(components):
        """Look up the name row of every node in each multi-node component."""
        return [
            [
                s.query(db.Package.name).filter(
                    db.Package.id == pkg_id).first()
                for pkg_id in component
            ]
            for component in components
            if len(component) > 1
        ]

    return {
        'strong': component_names(nx.strongly_connected_components(g)),
        'weak': component_names(nx.weakly_connected_components(g)),
    }
def downloads_vs_indegree(s, filename, g=None):
    """Chart downloads per package against the number of times it is required.

    The log-log chart is saved to *filename* and the plotted data is
    returned.

    Args:
        s: Database session used to query ``db.Release``.
        filename: Destination passed to ``plt.savefig``.
        g: Optional pre-built directed graph whose nodes are package ids.
            When omitted, it is built from ``get_pkg_edgelist(s)``.

    Returns:
        A list of ``[in_degree, downloads]`` pairs, one per graph node.
        ``downloads`` is the summed ``downloads`` of the package's releases
        flagged ``current``; it can be ``None`` when the sum query yields no
        value.
    """
    # Explicit None check: an empty graph is falsy but is still valid input.
    if g is None:
        g = nx.DiGraph(get_pkg_edgelist(s))

    plot_data = [
        [
            g.in_degree(pkg_id),
            # `== True` is intentional: it builds a SQL expression rather
            # than evaluating a Python comparison.
            s.query(func.sum(db.Release.downloads)).filter(
                db.Release.current == True).filter(  # noqa: E712
                    db.Release.package_id == pkg_id).first()[0],
        ]
        for pkg_id in g.nodes()
    ]

    # zip(*[]) yields nothing to unpack, so a graph without nodes is handled
    # explicitly instead of raising ValueError.
    if plot_data:
        y, x = zip(*plot_data)
    else:
        x, y = (), ()

    plt.loglog(x, y, marker=',', linestyle='None')
    plt.title('Downloads vs. # times required')
    plt.ylabel('# times required')
    plt.xlabel('Downloads')
    plt.ylim([0, 1000])
    plt.grid(True)
    plt.savefig(filename)
    plt.close()
    return plot_data