def content_similarity(query, compare_to, content_field='content'):
    # load similarity data from local storage
    index = similarities.MatrixSimilarity.load('static/local_index.index')
    dictionary = corpora.Dictionary.load('static/local_dict.dict')
    corpus = corpora.MmCorpus('static/local_corpus.mm')
    lsi = models.LsiModel.load('static/local_lsi.lsi')
    G = json_graph.load(open("static/local_instance.json"))
    # list all titles to report the top similar topics for debugging
    titles = [G.node[each_node]['title'] for each_node in G.nodes()]
    # Gensim similarity computation
    query = G.node[get_id(query)][content_field]
    vec_bow = dictionary.doc2bow(query.lower().split())
    vec_lsi = lsi[vec_bow]  # convert the query to LSI space
    sims = index[vec_lsi]
    # sort topics in descending order of similarity (debugging)
    sorted_sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print "-----------------------------------------------------------------"
    for each in sorted_sims[0:15]:
        print str(each) + " --> " + titles[each[0]]
    print "-----------------------------------------------------------------"
    return sims[titles.index(compare_to)]
def nca(name1, name2):
    G = json_graph.load(open("static/local_instance.json"))
    frontier1 = [get_id(name1)]
    frontier2 = [get_id(name2)]
    done = False
    while not done:
        # retrieve nodes in the next BFS shell
        shell1 = list(chain.from_iterable(G.predecessors(each) for each in frontier1))
        shell2 = list(chain.from_iterable(G.predecessors(each) for each in frontier2))
        # no new nodes. End of the line
        if not shell1 and not shell2:
            return []
        frontier1 += shell1
        frontier2 += shell2
        intersect = set(frontier1) & set(frontier2)
        if intersect:
            done = True
    #print intersect
    return [(nx.shortest_path(G, ancestor, get_id(name1)),
             nx.shortest_path(G, ancestor, get_id(name2)))
            for ancestor in list(intersect)]
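A minimal usage sketch for nca, assuming the static/local_instance.json graph and the get_id helper from this module are available; the concept names below are hypothetical placeholders:

# Hypothetical example: the concept names are placeholders and assume
# get_id() can resolve them against static/local_instance.json.
ancestor_paths = nca("Calculus", "Derivative")
for path1, path2 in ancestor_paths:
    # path1[0] == path2[0] is the id of one nearest common ancestor
    print "common ancestor:", path1[0]
    print "  path to first concept: ", path1
    print "  path to second concept:", path2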
def data(self, **kw):
    try:
        with closing(open('cache.json', 'r')) as data_file:
            print 'Reading from cache'
            return data_file.read()
    except IOError:
        print 'Fetching data'
        with closing(open('cache.json', 'w')) as data_file:
            foaf_graph = None
            try:
                with closing(open('graph_cache.json', 'r')) as graph_file:
                    print 'Reading from graph cache'
                    foaf_graph = jg.load(graph_file)
            except IOError:
                foaf_graph = retrieve_foaf(FBTOKEN)
            clusters = community.best_partition(foaf_graph)
            degree_distribution = get_histograms(foaf_graph)
            cluster_counts = get_cluster_counts(clusters)
            top10 = get_top_degree(foaf_graph, 10)
            foaf_json_graph = json.loads(jg.dumps(foaf_graph))
            infos = {
                'graph': foaf_json_graph,
                'clusters': clusters,
                'cluster_counts': cluster_counts,
                'degree_distribution': degree_distribution,
                'degree': foaf_graph.degree(),
                'top10': top10
            }
            foaf_data = json.dumps(infos)
            data_file.write(foaf_data)
            return foaf_data
def analyzeGraph(self, jsonFile, level=10):
    data = []
    nxg = json_graph.load(open(jsonFile))
    for n in nxg.nodes(data=True):
        if nxg.in_degree(n[0]) == 0:
            rootNode = n
            break
    paths = nx.single_source_shortest_path(nxg, rootNode[0], level)
    # Dictionary to keep track of nodes at length x from root node
    nodes = {}
    for k, v in paths.items():
        if k == rootNode[0]:
            continue  # exclude root node
        if not nodes.has_key(len(v) - 1):
            nodes[len(v) - 1] = []
        nodes[len(v) - 1].append(k)
    for k in sorted(nodes.keys()):
        bunch = [rootNode[0]]
        for i in range(1, k + 1):
            bunch.extend(nodes[i])
        subgraph = nxg.subgraph(bunch)
        data.append({'name': rootNode[1]['name'],
                     'level': k,
                     'node_cnt': subgraph.number_of_nodes(),
                     'edge_cnt': subgraph.number_of_edges()})
    return data
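A hedged usage sketch for analyzeGraph, assuming an instance of the surrounding class (here called analyzer, a placeholder) and a node-link JSON file whose single root has in-degree 0; the file name is hypothetical:

# Hypothetical instance and file name.
rows = analyzer.analyzeGraph('tree_graph.json', level=5)
for row in rows:
    # cumulative node/edge counts up to each BFS level below the root
    print "level %d: %d nodes, %d edges" % (row['level'], row['node_cnt'], row['edge_cnt'])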
def read_json_file(filename, info=True):
    '''
    Use if you already have a json rep of a graph and want to update/modify it
    '''
    graph = json_graph.load(open(filename))
    if info:
        print "Read in file ", filename
        print nx.info(graph)
    return graph
def min_path(name1, name2):
    G = json_graph.load(open("static/local_instance.json"))
    path = []
    # check in both directions
    try:
        path = nx.shortest_path(G, get_id(name1), get_id(name2))
    except:
        path = nx.shortest_path(G, get_id(name2), get_id(name1))
    return path
def color_path(path, color):
    G = json_graph.load(open("static/local_instance.json"))
    H = G.subgraph(path)
    for each in zip(path, path[1:]):
        H.edge[each[0]][each[1]]['color'] = color
        H.node[each[1]]['color'] = color
    H.node[path[0]]['color'] = color
    return H
def expand_graph(H, direction='in'):
    G = json_graph.load(open("static/local_instance.json"))
    # add predecessor or successor nodes depending on 'direction'
    frontier = []
    if direction == 'out':
        frontier = list(chain.from_iterable(G.successors(each) for each in H.nodes()))
    elif direction == 'in':
        frontier = list(chain.from_iterable(G.predecessors(each) for each in H.nodes()))
    return nx.compose(H, G.subgraph(frontier + H.nodes()))
def load_graph(fn):
    if fn is None:
        return None
    if not os.path.isfile(fn):
        print >> sys.stderr, "MISSING FILE %s" % fn
        return None
    if re.match(r".*\.gz$", fn):
        f = gzip.open(fn, "r")
    else:
        f = open(fn, "r")
    return json_graph.load(f)
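A small sketch of calling load_graph; the file names are placeholders, and the function picks gzip or plain reading based on the extension:

# Hypothetical paths: plain and gzip-compressed node-link JSON dumps.
g = load_graph("graph.json")
g_gz = load_graph("graph.json.gz")
if g is not None:
    print nx.info(g)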
def computeDiameter():
    G = json_graph.load(open("static/local_instance.json"))
    diameter = 0
    for g in nx.connected_component_subgraphs(G.to_undirected()):
        try:
            diameter = max(diameter, nx.diameter(g))
        except:
            pass
    with open("static/diameter.dat", "w") as f:
        f.write("%f" % diameter)
def findPath(self, jsonFile, celebrity):
    nxg = json_graph.load(open(jsonFile))
    for n in nxg.nodes(data=True):
        if nxg.in_degree(n[0]) == 0:
            rootNode = n
            break
    reverseNxg = nxg.reverse(copy=True)
    for node in reverseNxg.nodes(data=True):
        if node[1]['name'] == celebrity:
            for p in nx.all_simple_paths(reverseNxg, node[0], rootNode[0]):
                print [nxg.node[x]['name'] for x in p]
            break
def wikilinks_graph(self):
    """
    Generate a wikilinks graph using networkx
    :rtype: Graph
    """
    import tempfile
    from networkx.readwrite import json_graph
    import networkx as nx
    import re
    import requests
    tmpdir = tempfile.gettempdir()
    graph_object = tmpdir + '/' + str(self.id) + '.wikilinks.json'

    def _get_links(ngram):
        ngram_links = json.loads(requests.get(template_query.format(ngram)).text)
        try:
            ngram_links = ngram_links['query']['pages'].values()[0]['links']
        except KeyError:
            return []
        ngram_links = [re.sub(r' \(.+\)', '', link['title'].lower()) for link in ngram_links]
        ngram_links = set([ngram for ngram in ngram_links if len(ngram.split()) > 1])
        return ngram_links

    if not os.path.exists(graph_object):
        graph = nx.Graph()
        links_dict = {}
        template_query = u'http://en.wikipedia.org/w/api.php?action=query&titles={0}&prop=links&plnamespace=0&pllimit=500&format=json'
        article_ngrams = list(self.articlecollocation_set.values_list('ngram', flat=True))
        for i, ngram1 in enumerate(article_ngrams):
            if ngram1 in links_dict:
                ngram1_links = links_dict[ngram1]
            else:
                ngram1_links = _get_links(ngram1)
                links_dict[ngram1] = ngram1_links
            for j in range(i + 1, len(article_ngrams)):
                ngram2 = article_ngrams[j]
                if ngram2 in links_dict:
                    ngram2_links = links_dict[ngram2]
                else:
                    ngram2_links = _get_links(ngram2)
                    links_dict[ngram2] = ngram2_links
                if ngram1 in ngram2_links or ngram2 in ngram1_links:
                    graph.add_edge(ngram1, ngram2)
        json_graph.dump(graph, open(graph_object, 'w'))
        return graph
    else:
        graph = json_graph.load(open(graph_object))
        return graph
def concept_nbh(name, levels):
    G = json_graph.load(open("static/local_instance.json"))
    G = G.subgraph(get_id(name))
    # the user might not know his numbers very well
    try:
        levels = int(levels)
    except:
        levels = 0
    for i in range(levels):
        G = expand_graph(G)
    data = json_graph.dumps(G)
    return data
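A hedged sketch of concept_nbh, assuming get_id can resolve the (hypothetical) concept name against static/local_instance.json; the result is a node-link JSON string ready to hand to a front end:

# Hypothetical concept name; expand the one-node subgraph by two levels
# of predecessors and serialize it.
nbh_json = concept_nbh("Linear algebra", 2)
print nbh_json[:200]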
def show_nca(name1, name2, levels=0):
    nca_list = nca(name1, name2)
    G = json_graph.load(open("static/local_instance.json"))
    H = nx.DiGraph()
    for each in nca_list:
        anc_path = nx.compose(color_path(each[0], 'green'), color_path(each[1], 'yellow'))
        H = nx.compose(H, anc_path)
    for i in range(levels):
        H = expand_graph(H)
    for each in nca_list:
        H.node[each[0][0]]['color'] = 'red'  # color the nca differently
    data = json_graph.dumps(H)
    return data
def similarity(name1, name2, content_field='content'):
    G = json_graph.load(open("static/local_instance.json"))
    H = nx.DiGraph()
    path = nx.shortest_path(G, get_id(name1), get_id(name2))
    diameter = 0
    with open("static/diameter.dat", "r") as f:
        diameter = float(f.read())
    f1 = len(path) / float(diameter)
    print "minpath factor: %f" % f1
    nca_list = nca(name1, name2)
    # do something with the information
    #print "nca factor: %f" % f2
    f3 = content_similarity(G.node[get_id(name1)][content_field], name2)
    print "content sim factor: %f" % f3
def trainCorpus(content_field='content'):
    G = json_graph.load(open("static/local_instance.json"))
    sentences = []
    for each_node in G.nodes():
        sentences.append(G.node[each_node][content_field])
    # pre-process text: convert to lowercase, remove numbers/symbols, remove stopwords
    texts = []
    for string in sentences:
        words = re.findall(r'[a-z]+', string.lower())
        imp_words = filter(lambda x: x not in stopwords.words('english'), words)
        texts.append(imp_words)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # num_topics should be around 200 for a large corpus
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100)
    # transform corpus to LSI space and index it
    index = similarities.MatrixSimilarity(lsi[corpus])
    # locally store computed information
    dictionary.save('static/local_dict.dict')
    corpora.MmCorpus.serialize('static/local_corpus.mm', corpus)
    index.save('static/local_index.index')
    lsi.save('static/local_lsi.lsi')
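The LSI artifacts written by trainCorpus under static/ are the same files content_similarity loads, so a typical flow (sketched here with hypothetical concept names) trains once and then scores pairs:

# Sketch only: the concept names are placeholders and assume get_id()
# resolves them and that 'Derivative' appears among the graph's titles.
trainCorpus()  # writes the .dict/.mm/.index/.lsi files under static/
score = content_similarity("Calculus", "Derivative")
print "LSI similarity: %f" % score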
def read_json_file(self, filename):
    self.G = json_graph.load(open(filename))
    print "Read in file ", filename
def read_json_file(filename, info=True):
    graph = json_graph.load(open(filename))
    if info:
        print("Read in file ", filename)
        print(nx.info(graph))
    return graph
def read_json_file(filename, info=True): graph = json_graph.load(open(filename)) if info: print "Read in file ", filename print nx.info(graph) return graph
def local_mega_graph():
    H = json_graph.load(open("static/local_instance.json"))
    data = json_graph.dumps(H)
    return data
def read_json_file(filename): graph = json_graph.load(open(filename)) print "Read in file", filename print nx.info(graph).replace("\n"," | ") return graph
def main():
    LOGGER.setLevel(logging.DEBUG)
    APILOGGER.setLevel(logging.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument('seed', metavar='U', type=str, nargs='+',
                        help='seed users')
    parser.add_argument('--saved', dest='savefile', metavar='SAVED', type=str,
                        help='saved progress .json', default='')
    args = parser.parse_args()
    oauth = {}
    with open('oauth.json') as raw:
        oauth = json.load(raw)
    tapi = api.Wrapper(oauth['access'], oauth['accessSecret'],
                       oauth['consumer'], oauth['consumerSecret'])
    if args.savefile:
        with open(args.savefile) as saved:
            graph = nxjson.load(saved)
    else:
        graph = nx.DiGraph()
    seed = set(map(int, args.seed))
    users = seed.union(*[tapi.followers(acct) for acct in seed])
    graph.add_nodes_from(users)
    users = graph.nodes()
    progress = 0
    total = 0
    for user in users:
        total += 1
        if graph.node[user].get('username'):
            continue
        info = tapi.info(user)
        if not info.get('screen_name') or info.get('protected'):
            graph.remove_node(user)
            continue
        progress += 1
        LOGGER.debug('Adding user %s aka @%s', user, info.get('screen_name'))
        graph.add_node(user, name=info.get('name', ''), location=info.get('location', ''),
                       followers=info.get('followers_count', 0), lang=info.get('lang', ''),
                       following=info.get('friends_count', 0), username=info.get('screen_name', ''),
                       protected=info.get('protected'))
        if progress == 100:
            progress = 0
            with open('tmp.json', 'w') as garph:
                nxjson.dump(graph, garph)
            LOGGER.info('Saved info for %s/%s users', total, len(users))
    LOGGER.info('Info collected')
    with open('garph.json', 'w') as garph:
        nxjson.dump(graph, garph)
    nodeSet = set(graph.nodes())
    users = graph.nodes()  # Removed some nodes earlier, can't try to access them
    progress = 0
    while users:
        # for some reason no edges to seed, but they are in users because we got their data
        user = users.pop()
        if graph.node[user].get('complete'):
            continue
        graph.node[user]['complete'] = True
        try:
            followers = tapi.followers(user)
        except api.NoDataError:
            continue
        relevant = followers & nodeSet
        graph.node[user]['follower_ids'] = ','.join(map(str, followers))
        LOGGER.debug('Adding edges for user %s aka @%s', user, graph.node[user]['username'])
        graph.add_edges_from([(follower, user) for follower in relevant])
        progress += 1
        if progress == 5:
            progress = 0
            with open('tmp.json', 'w') as garph:
                nxjson.dump(graph, garph)
            LOGGER.info('Saved graph with %s user edgesets remaining', len(users))
    with open('garph.json', 'w') as garph:
        nxjson.dump(graph, garph)
    nx.write_gexf(graph, 'garph.gexf')
    LOGGER.info('Edges collected')
    LOGGER.info('Job complete')
def dbpedia_graph(self, redirects=True):
    """
    Generate a dbpedia category TREE using networkx
    :rtype: nx.Graph
    """
    import tempfile
    import requests
    from networkx.readwrite import json_graph
    tmpdir = tempfile.gettempdir()
    if redirects:
        graph_object = tmpdir + '/' + str(self.id) + 'redirects.' + '.dbpedia.json'
    else:
        graph_object = tmpdir + '/' + str(self.id) + '.dbpedia.json'
    if not os.path.exists(graph_object):
        stop_uris_set = open(settings.ABS_PATH('stop_uri.txt')).read().split()
        stop_uris_set = set([x.split('/')[-1] for x in stop_uris_set])

        def recurse_populate_graph(resource, graph, depth):
            if resource in stop_uris_set:
                return
            if depth == 0:
                return
            if 'Category' in resource:
                query = u'SELECT ?broader, ?related, ?broaderof WHERE' \
                        u' {{{{ <http://dbpedia.org/resource/{0}> skos:broader ?broader }}' \
                        u' UNION {{ ?broaderof skos:broader <http://dbpedia.org/resource/{0}> }}' \
                        u' UNION {{ ?related skos:related <http://dbpedia.org/resource/{0}> }}' \
                        u' UNION {{ <http://dbpedia.org/resource/{0}> skos:related ?related }}}}'.format(resource)
                results = []
                sparql = SPARQLWrapper("http://dbpedia.org/sparql")
                sparql.setReturnFormat(JSON)
                sparql.setQuery(query)
                results.extend(sparql.query().convert()['results']['bindings'])
                for result in results:
                    for rel_type, value in result.iteritems():
                        uri = value['value']
                        parent_resource = uri.split('/')[-1]
                        #print ' ' * (3 - depth), resource, '->', parent_resource
                        graph.add_edge(resource, parent_resource, type=rel_type)
                        recurse_populate_graph(parent_resource, graph, depth - 1)
            else:
                if resource == 'cumulative gain':
                    resource = 'Discounted_cumulative_gain'
                elif resource == 'world wide web conference':
                    resource = 'International_World_Wide_Web_Conference'
                wiki_cat_query = u'http://en.wikipedia.org/w/api.php?action=query&titles={0}&prop=categories&cllimit=50&clshow=!hidden&format=json&redirects'
                results = json.loads(requests.get(wiki_cat_query.format(resource)).text)['query']['pages'].values()[0]
                if 'missing' in results:
                    results = json.loads(requests.get(wiki_cat_query.format(resource.title())).text)['query']['pages'].values()[0]
                    if 'missing' in results:
                        print results, resource
                        results = []
                    else:
                        results = [c['title'].replace(' ', '_') for c in results['categories']]
                else:
                    results = [c['title'].replace(' ', '_') for c in results['categories']]
                rel_type = "subject"
                for parent_resource in results:
                    #print ' ' * (3 - depth), resource, '->', parent_resource
                    graph.add_edge(resource, parent_resource, type=rel_type)
                    recurse_populate_graph(parent_resource, graph, depth - 1)

        import networkx as nx
        from SPARQLWrapper import SPARQLWrapper, JSON
        graph = nx.Graph()
        ngrams = set(self.articlecollocation_set.values_list('ngram', flat=True))
        ngrams = self.CollocationModel.COLLECTION_MODEL.objects.filter(ngram__in=ngrams)
        for ngram in ngrams:
            if 'dbpedia' in ngram.source or (redirects and 'wiki_redirect' in ngram.source):
                recurse_populate_graph(ngram.ngram, graph, 2)
        json_graph.dump(graph, open(graph_object, 'w'))
    else:
        graph = json_graph.load(open(graph_object))
    return graph