Example #1
0
def content_similarity(query, compare_to, content_field='content'):
	
	#load similarity data from local storage:
	index = similarities.MatrixSimilarity.load('static/local_index.index')
	dictionary = corpora.Dictionary.load('static/local_dict.dict')
	corpus = corpora.MmCorpus('static/local_corpus.mm')
	lsi=models.LsiModel.load('static/local_lsi.lsi')

	G=json_graph.load(open("static/local_instance.json"))
	#list all titles to generate top similar topics for debugging	
	titles=[title for title in [G.node[each_node]['title'] for each_node in G.nodes()]]

	#Gensim similarity computation
	query=G.node[get_id(query)][content_field]
	vec_bow = dictionary.doc2bow(query.lower().split())
	vec_lsi = lsi[vec_bow] # convert the query to LSI space
	sims = index[vec_lsi]

	#sort topics in descending order of similarity (debugging)
	
	sorted_sims = sorted(enumerate(sims), key=lambda item: -item[1])
	print "-----------------------------------------------------------------"
	for each in sorted_sims[0:15]:
		print str(each)+" --> "+titles[each[0]]
	print "-----------------------------------------------------------------"
	
	return sims[titles.index(compare_to)]
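The load calls above assume that the dictionary, LSI model and similarity index already sit in static/; they are produced by trainCorpus() in Example #24. As a self-contained illustration of the same gensim flow (doc2bow, projection into LSI space, MatrixSimilarity lookup), here is a minimal sketch on a few hypothetical in-memory documents:

# Minimal sketch of the gensim pipeline behind content_similarity();
# the documents and the query below are made up for illustration.
from gensim import corpora, models, similarities

docs = ["graph theory and networks",
        "linear algebra and matrices",
        "network flows and graph algorithms"]
texts = [doc.lower().split() for doc in docs]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity(lsi[corpus])

vec_bow = dictionary.doc2bow("graph algorithms".lower().split())
sims = index[lsi[vec_bow]]   # one cosine similarity per document, in corpus order
print(list(enumerate(sims)))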
Example #2
0
def nca(name1, name2):
    G = json_graph.load(open("static/local_instance.json"))

    frontier1 = [get_id(name1)]
    frontier2 = [get_id(name2)]

    done = False
    while not done:
        #retrieve nodes in next BFS shell
        shell1 = list(
            chain.from_iterable(G.predecessors(each) for each in frontier1))
        shell2 = list(
            chain.from_iterable(G.predecessors(each) for each in frontier2))

        #no new nodes. End of the line
        if not shell1 and not shell2:
            return []

        frontier1 += shell1
        frontier2 += shell2
        intersect = set(frontier1) & set(frontier2)

        if intersect:
            done = True
            #print intersect

    return [(nx.shortest_path(G, ancestor, get_id(name1)),
             nx.shortest_path(G, ancestor, get_id(name2)))
            for ancestor in list(intersect)]
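The loop grows a predecessor "shell" around each start node one BFS level at a time and stops as soon as the two frontiers intersect; every node in the intersection is a nearest common ancestor. A toy sketch of the same idea on a hand-built DiGraph (node names and edges are made up):

# BFS-shell common-ancestor search on a toy graph, for illustration only.
import networkx as nx
from itertools import chain

G = nx.DiGraph()
G.add_edges_from([('science', 'math'), ('science', 'physics'),
                  ('math', 'algebra'), ('physics', 'mechanics')])

frontier1, frontier2 = ['algebra'], ['mechanics']
ancestors = set()
while not ancestors:
    shell1 = list(chain.from_iterable(G.predecessors(n) for n in frontier1))
    shell2 = list(chain.from_iterable(G.predecessors(n) for n in frontier2))
    if not shell1 and not shell2:
        break
    frontier1 += shell1
    frontier2 += shell2
    ancestors = set(frontier1) & set(frontier2)

print(sorted(ancestors))   # ['science']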
Example #3
0
def nca(name1, name2):
	G=json_graph.load(open("static/local_instance.json"))	

	frontier1=[get_id(name1)]
	frontier2=[get_id(name2)]
	
	done=False
	while not done:
		#retrieve nodes in next BFS shell
		shell1=list(chain.from_iterable(G.predecessors(each) for each in frontier1))
		shell2=list(chain.from_iterable(G.predecessors(each) for each in frontier2))

		#no new nodes. End of the line
		if not shell1 and not shell2:
			return []
		
		frontier1+=shell1
		frontier2+=shell2
		intersect=set(frontier1)&set(frontier2)
	
		if intersect:
			done=True
			#print intersect

	return [(nx.shortest_path(G,ancestor,get_id(name1)),nx.shortest_path(G,ancestor,get_id(name2))) for ancestor in list(intersect)]
Example #4
0
def content_similarity(query, compare_to, content_field='content'):

    #load similarity data from local storage:
    index = similarities.MatrixSimilarity.load('static/local_index.index')
    dictionary = corpora.Dictionary.load('static/local_dict.dict')
    corpus = corpora.MmCorpus('static/local_corpus.mm')
    lsi = models.LsiModel.load('static/local_lsi.lsi')

    G = json_graph.load(open("static/local_instance.json"))
    #list all titles to generate top similar topics for debugging
    titles = [
        title
        for title in [G.node[each_node]['title'] for each_node in G.nodes()]
    ]

    #Gensim similarity computation
    query = G.node[get_id(query)][content_field]
    vec_bow = dictionary.doc2bow(query.lower().split())
    vec_lsi = lsi[vec_bow]  # convert the query to LSI space
    sims = index[vec_lsi]

    #sort topics in descending order of similarity (debugging)

    sorted_sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print "-----------------------------------------------------------------"
    for each in sorted_sims[0:15]:
        print str(each) + " --> " + titles[each[0]]
    print "-----------------------------------------------------------------"

    return sims[titles.index(compare_to)]
Example #5
0
 def data(self, **kw):
     try:
         with closing(open('cache.json', 'r')) as data_file:
             print 'Reading from cache'
             return data_file.read()
     except IOError:
         print 'Fetching data'
         with closing(open('cache.json', 'w')) as data_file:
             foaf_graph = None
             try:
                 with closing(open('graph_cache.json', 'r')) as graph_file:
                     print 'Reading from graph cache'
                     foaf_graph = jg.load(graph_file)
             except IOError:
                 foaf_graph = retrieve_foaf(FBTOKEN)
             clusters = community.best_partition(foaf_graph)
             degree_distribution = get_histograms(foaf_graph)
             cluster_counts = get_cluster_counts(clusters)
             top10 = get_top_degree(foaf_graph, 10)
             foaf_json_graph = json.loads(jg.dumps(foaf_graph))
             ob = foaf_graph.degree()
             infos = {
                 'graph':foaf_json_graph,
                 'clusters':clusters,
                 'cluster_counts':cluster_counts,
                 'degree_distribution':degree_distribution,
                 'degree':foaf_graph.degree(),
                 'top10':top10
             }
             foaf_data = json.dumps(infos)
             data_file.write(foaf_data)
             return foaf_data
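The method is a read-through file cache: if cache.json exists it is returned verbatim, otherwise the FOAF graph is loaded (or fetched), analyzed, and the serialized result is written back before being returned. A stripped-down sketch of that pattern, with a stand-in compute step:

# Read-through JSON file cache; compute_payload() stands in for the
# community/degree analysis performed in data() above.
import json

def cached_json(path, compute_payload):
    try:
        with open(path, 'r') as f:       # cache hit
            return f.read()
    except IOError:                      # cache miss: rebuild and persist
        payload = json.dumps(compute_payload())
        with open(path, 'w') as f:
            f.write(payload)
        return payload

# usage: cached_json('cache.json', lambda: {'top10': [], 'clusters': {}})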
Example #6
0
    def analyzeGraph(self, jsonFile, level=10):
        data = []
        nxg = json_graph.load(open(jsonFile))
        for n in nxg.nodes(data=True):
            if nxg.in_degree(n[0]) == 0:
                rootNode = n
                break 
        paths = nx.single_source_shortest_path(nxg,rootNode[0],level)
        nodes = {} # Dictionary to keep track of nodes at length x from root node
        for k,v in paths.items():
            if k == rootNode[0]: continue # exclude root node
            if not nodes.has_key(len(v) - 1):
                nodes[len(v) - 1] = []
            nodes[len(v) - 1].append(k)
                                     
#        cTotal = 0 # cumulative total

        for k in sorted(nodes.keys()):
            bunch = [rootNode[0]]
            for i in range(1,k + 1):
                bunch.extend(nodes[i])
            subgraph = nxg.subgraph(bunch)
            data.append({'name' : rootNode[1]['name'],
                         'level' : k,
                         'node_cnt' : subgraph.number_of_nodes(),
                         'edge_cnt' : subgraph.number_of_edges()})
        return data
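analyzeGraph() buckets every node by its distance from the root and then reports cumulative node and edge counts per level. The same bucketing can be expressed directly with shortest-path lengths; a small sketch (nxg and the root node are assumed to come from the surrounding method):

# Group nodes by distance from the root using path lengths directly.
import networkx as nx
from collections import defaultdict

def nodes_by_level(nxg, root, level=10):
    lengths = nx.single_source_shortest_path_length(nxg, root, cutoff=level)
    levels = defaultdict(list)
    for node, dist in lengths.items():
        if dist > 0:                     # skip the root itself
            levels[dist].append(node)
    return dict(levels)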
Example #7
0
def read_json_file(filename, info=True):
	'''
	Use if you already have a json rep of a graph and want to update/modify it
	'''
	graph = json_graph.load(open(filename))
	if info:
		print "Read in file ", filename
		print nx.info(graph)
	return graph
Example #8
0
def min_path(name1, name2):
	G=json_graph.load(open("static/local_instance.json"))
	path=[]

	#check in both directions.
	try:
		path=nx.shortest_path(G,get_id(name1),get_id(name2))
	except:
		path=nx.shortest_path(G,get_id(name2),get_id(name1))
	return path
Example #9
0
def color_path(path, color):
    G = json_graph.load(open("static/local_instance.json"))
    H = G.subgraph(path)

    for each in zip(path, path[1:]):
        H.edge[each[0]][each[1]]['color'] = color
        H.node[each[1]]['color'] = color
    H.node[path[0]]['color'] = color

    return H
Example #10
0
def min_path(name1, name2):
    G = json_graph.load(open("static/local_instance.json"))
    path = []

    #check in both directions.
    try:
        path = nx.shortest_path(G, get_id(name1), get_id(name2))
    except:
        path = nx.shortest_path(G, get_id(name2), get_id(name1))
    return path
Example #11
0
def color_path(path, color):
	G=json_graph.load(open("static/local_instance.json"))
	H=G.subgraph(path)

	for each in zip(path,path[1:]):
		H.edge[each[0]][each[1]]['color']=color
		H.node[each[1]]['color']=color	
	H.node[path[0]]['color']=color

	return H
Example #12
0
def expand_graph(H, direction='in'):
	G=json_graph.load(open("static/local_instance.json"))

	#add predecessor or successor nodes depending on 'direction'
	frontier=[]
	if direction == 'out':
		frontier=list(chain.from_iterable(G.successors(each) for each in H.nodes()))
	elif direction == 'in':
		frontier=list(chain.from_iterable(G.predecessors(each) for each in H.nodes()))
	return nx.compose(H,G.subgraph(frontier+H.nodes()))
Example #13
0
def load_graph(fn):
    if fn is None:
        return None
    if not os.path.isfile(fn):
        print >> sys.stderr, "MISSING FILE %s" % fn
        return None
    if re.match(r".*\.gz$", fn):
        f = gzip.open(fn, "r")
    else:
        f = open(fn, "r")
    return json_graph.load(f)
Example #14
0
def computeDiameter():
	G=json_graph.load(open("static/local_instance.json"))
	diameter=0
	for g in nx.connected_component_subgraphs(G.to_undirected()):
		try:
			diameter= max(diameter,nx.diameter(g))
		except:
			pass

	with open("static/diameter.dat", "w") as f:
		f.write("%f"%diameter)
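computeDiameter() takes the largest diameter over the undirected connected components and writes it to static/diameter.dat, where similarity() in Example #22 reads it back. nx.connected_component_subgraphs was removed in newer networkx releases, so here is an equivalent sketch built on nx.connected_components:

# Same computation with nx.connected_components, which is available in
# both old and current networkx releases.
import networkx as nx

def graph_diameter(G):
    UG = G.to_undirected()
    best = 0
    for component in nx.connected_components(UG):
        sub = UG.subgraph(component)
        if sub.number_of_nodes() > 1:
            best = max(best, nx.diameter(sub))
    return best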
Example #15
0
 def findPath(self, jsonFile, celebrity):
     nxg = json_graph.load(open(jsonFile))
     for n in nxg.nodes(data=True):
         if nxg.in_degree(n[0]) == 0:
             rootNode = n
             break 
     reverseNxg = nxg.reverse(copy=True)
     for node in reverseNxg.nodes(data=True):
         if node[1]['name'] == celebrity:
             for p in nx.all_simple_paths(reverseNxg,node[0],rootNode[0]):
                 print [nxg.node[x]['name'] for x in p]
             break 
Example #16
0
def expand_graph(H, direction='in'):
    G = json_graph.load(open("static/local_instance.json"))

    #add predecessor or successor nodes depending on 'direction'
    frontier = []
    if direction == 'out':
        frontier = list(
            chain.from_iterable(G.successors(each) for each in H.nodes()))
    elif direction == 'in':
        frontier = list(
            chain.from_iterable(G.predecessors(each) for each in H.nodes()))
    return nx.compose(H, G.subgraph(frontier + H.nodes()))
Example #17
0
File: models.py Project: XI-lab/axel
    def wikilinks_graph(self):
        """
        Generate a wikilinks graph using networkx
        :rtype: Graph
        """
        import tempfile
        from networkx.readwrite import json_graph
        import networkx as nx
        import re
        import requests

        tmpdir = tempfile.gettempdir()
        graph_object = tmpdir + '/' + str(self.id) + '.wikilinks.json'

        def _get_links(ngram):
            ngram_links = json.loads(requests.get(template_query.format(ngram)).text)
            try:
                ngram_links = ngram_links['query']['pages'].values()[0]['links']
            except KeyError:
                return []
            ngram_links = [re.sub(r' \(.+\)', '', link['title'].lower()) for link in ngram_links]
            ngram_links = set([ngram for ngram in ngram_links if len(ngram.split()) > 1])
            return ngram_links

        if not os.path.exists(graph_object):
            graph = nx.Graph()
            links_dict = {}
            template_query = u'http://en.wikipedia.org/w/api.php?action=query&titles={0}&prop=links&plnamespace=0&pllimit=500&format=json'
            article_ngrams = list(self.articlecollocation_set.values_list('ngram', flat=True))
            for i, ngram1 in enumerate(article_ngrams):
                if ngram1 in links_dict:
                    ngram1_links = links_dict[ngram1]
                else:
                    ngram1_links = _get_links(ngram1)
                    links_dict[ngram1] = ngram1_links
                for j in range(i+1, len(article_ngrams)):
                    ngram2 = article_ngrams[j]
                    if ngram2 in links_dict:
                        ngram2_links = links_dict[ngram2]
                    else:
                        ngram2_links = _get_links(ngram2)
                        links_dict[ngram2] = ngram2_links
                    if ngram1 in ngram2_links or ngram2 in ngram1_links:
                        graph.add_edge(ngram1, ngram2)
            json_graph.dump(graph, open(graph_object, 'w'))
            return graph

        else:
            graph = json_graph.load(open(graph_object))
            return graph
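The finished graph is cached in the temp directory with the json_graph.dump/load helpers from older networkx. Those helpers are gone in current releases; the node-link functions cover the same round trip, as in this sketch:

# On-disk caching round trip with the node_link_* helpers that replace
# json_graph.dump/load in current networkx.
import json
from networkx.readwrite import json_graph

def save_cached_graph(graph, path):
    with open(path, 'w') as f:
        json.dump(json_graph.node_link_data(graph), f)

def load_cached_graph(path):
    with open(path) as f:
        return json_graph.node_link_graph(json.load(f))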
Example #18
0
def concept_nbh(name, levels):
    G = json_graph.load(open("static/local_instance.json"))
    G = G.subgraph(get_id(name))

    #the user might not know his numbers very well
    try:
        levels = int(levels)
    except:
        levels = 0

    for i in range(levels):
        G = expand_graph(G)

    data = json_graph.dumps(G)
    return data
Example #19
0
def concept_nbh(name, levels):
	G=json_graph.load(open("static/local_instance.json"))
	G=G.subgraph(get_id(name))
	
	#the user might not know his numbers very well
	try:
		levels=int(levels)
	except:
		levels=0

	for i in range(levels):
		G=expand_graph(G)

	data=json_graph.dumps(G)
	return data
Example #20
0
def show_nca(name1, name2, levels=0):
	nca_list=nca(name1, name2)
	G=json_graph.load(open("static/local_instance.json"))
	H=nx.DiGraph()
	for each in nca_list:
		anc_path=nx.compose(color_path(each[0],'green'),color_path(each[1],'yellow'))
		H=nx.compose(H,anc_path)

	for i in range(levels):
		H=expand_graph(H)

	for each in nca_list:
		H.node[each[0][0]]['color']='red' #color the nca different

	data=json_graph.dumps(H)
	return data
Example #21
0
def show_nca(name1, name2, levels=0):
    nca_list = nca(name1, name2)
    G = json_graph.load(open("static/local_instance.json"))
    H = nx.DiGraph()
    for each in nca_list:
        anc_path = nx.compose(color_path(each[0], 'green'),
                              color_path(each[1], 'yellow'))
        H = nx.compose(H, anc_path)

    for i in range(levels):
        H = expand_graph(H)

    for each in nca_list:
        H.node[each[0][0]]['color'] = 'red'  #color the nca differently

    data = json_graph.dumps(H)
    return data
Example #22
0
def similarity(name1, name2, content_field='content'):
    G = json_graph.load(open("static/local_instance.json"))
    H = nx.DiGraph()
    path = nx.shortest_path(G, get_id(name1), get_id(name2))

    diameter = 0
    with open("static/diameter.dat", "r") as f:
        diameter = float(f.read())

    f1 = len(path) / float(diameter)
    print "minpath factor: %f" % f1

    nca_list = nca(name1, name2)
    #do something with the information
    #print "nca factor: %f"%f2

    f3 = content_similarity(G.node[get_id(name1)][content_field], name2)
    print "content sim factor: %f" % f3
Example #23
0
def similarity(name1, name2,content_field='content'):
	G=json_graph.load(open("static/local_instance.json"))
	H=nx.DiGraph();
	path=nx.shortest_path(G,get_id(name1),get_id(name2))

	diameter=0
	with open("static/diameter.dat", "r") as f:
		diameter=float(f.read())

	f1=len(path)/float(diameter)
	print "minpath factor: %f"%f1


	nca_list=nca(name1,name2)
	#do something with the information
	#print "nca factor: %f"%f2

	f3=content_similarity(G.node[get_id(name1)][content_field], name2)
	print "content sim factor: %f"%f3
Example #24
0
def trainCorpus(content_field='content'):
	G=json_graph.load(open("static/local_instance.json"))
	sentences=[]
	
	for each_node in G.nodes():
		sentences.append(G.node[each_node][content_field]) 

	#pre-process text: convert to lowercase, remove numbers/symbols, remove stopwords
	texts=[]
	for string in sentences:
		words = re.findall(r'[a-z]+', string.lower())
		imp_words = filter(lambda x: x not in stopwords.words('english'), words)
		texts.append(imp_words)

	dictionary = corpora.Dictionary(texts)
	corpus = [dictionary.doc2bow(text) for text in texts]
	lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100) #num_topics should be around 200 for a large corpus
	index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it
	
	#locally store computed information
	dictionary.save('static/local_dict.dict')
	corpora.MmCorpus.serialize('static/local_corpus.mm', corpus) 
	index.save('static/local_index.index')
	lsi.save('static/local_lsi.lsi')
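Running trainCorpus() once produces exactly the four files that content_similarity() in Example #1 loads back. A hypothetical usage (the topic titles are placeholders for node titles in local_instance.json):

# Hypothetical usage: train once, then query the persisted index.
trainCorpus()
print(content_similarity('Graph theory', 'Topology'))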
Example #25
0
 def read_json_file(self, filename):
     self.G = json_graph.load(open(filename))
     print "Read in file ", filename
Example #26
0
 def read_json_file(self, filename):
     self.G = json_graph.load(open(filename))
     print "Read in file ", filename
Example #27
0
def read_json_file(filename, info=True):
    graph = json_graph.load(open(filename))
    if info:
        print("Read in file ", filename)
        print(nx.info(graph))
    return graph
Example #28
0
def read_json_file(filename, info=True):
    graph = json_graph.load(open(filename))
    if info:
        print "Read in file ", filename
        print nx.info(graph)
    return graph
Example #29
0
def local_mega_graph():
    H = json_graph.load(open("static/local_instance.json"))
    data = json_graph.dumps(H)
    return data
Example #30
0
def local_mega_graph():
	H=json_graph.load(open("static/local_instance.json"))
	data=json_graph.dumps(H)
	return data
Example #31
0
def read_json_file(filename):
    graph = json_graph.load(open(filename))
    print "Read in file", filename
    print nx.info(graph).replace("\n"," | ")
    return graph
Example #32
0
File: graph.py Project: rainest/grit
def main():
    LOGGER.setLevel(logging.DEBUG)
    APILOGGER.setLevel(logging.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument('seed', metavar='U', type=str, nargs='+',\
            help='seed users')
    parser.add_argument('--saved', dest='savefile', metavar='SAVED', type=str,\
            help='saved progress .json', default='')
    args = parser.parse_args()
    oauth = {}
    with open('oauth.json') as raw:
        oauth = json.load(raw)
        
    tapi = api.Wrapper(oauth['access'], oauth['accessSecret'],\
            oauth['consumer'], oauth['consumerSecret'])
    if args.savefile:
        with open(args.savefile) as saved:
            graph = nxjson.load(saved)
    else:
        graph = nx.DiGraph()
        seed = set(map(int, args.seed))
        users = seed.union(*[tapi.followers(acct) for acct in seed])
        graph.add_nodes_from(users)
 
    users = graph.nodes()
    progress = 0
    total = 0
    for user in users:
        total += 1
        if graph.node[user].get('username'):
            continue
        info = tapi.info(user)
        if not info.get('screen_name') or info.get('protected'):
            graph.remove_node(user)
            continue
        progress += 1
        LOGGER.debug('Adding user %s aka @%s', user, info.get('screen_name'))
        graph.add_node(user, name=info.get('name',''), location=info.get('location',''),\
                followers=info.get('followers_count',0), lang=info.get('lang',''),\
                following=info.get('friends_count',0), username=info.get('screen_name',''),\
                protected=info.get('protected'))
        if progress == 100:
            progress = 0
            with open('tmp.json', 'w') as garph:
                nxjson.dump(graph, garph)
            LOGGER.info('Saved info for %s/%s users', total, len(users))

    LOGGER.info('Info collected')
    with open('garph.json', 'w') as garph:
        nxjson.dump(graph, garph)
    nodeSet = set(graph.nodes())
    users = graph.nodes() # Removed some nodes earlier, can't try to access them
    progress = 0

    while users: # for some reason no edges to seed, but they are in users because we got their data
        user = users.pop()
        if graph.node[user].get('complete'):
            continue
        graph.node[user]['complete'] = True
        try:
            followers = tapi.followers(user)
        except api.NoDataError:
            continue
        relevant = followers & nodeSet
        graph.node[user]['follower_ids'] = ','.join(map(str, followers))
        LOGGER.debug('Adding edges for user %s aka @%s', user, graph.node[user]['username'])
        graph.add_edges_from([(follower, user) for follower in relevant])
        progress += 1
        if progress == 5:
            progress = 0
            with open('tmp.json', 'w') as garph:
                nxjson.dump(graph, garph)
            LOGGER.info('Saved graph with %s user edgesets remaining', len(users))

    with open('garph.json', 'w') as garph:
        nxjson.dump(graph, garph)
    nx.write_gexf(graph, 'garph.gexf')
    LOGGER.info('Edges collected')
    LOGGER.info('Job complete')
Example #33
0
File: models.py Project: XI-lab/axel
    def dbpedia_graph(self, redirects=True):
        """
        Generate a dbpedia category TREE using networkx
        :rtype: nx.Graph
        """
        import tempfile
        import requests
        from networkx.readwrite import json_graph
        tmpdir = tempfile.gettempdir()
        if redirects:
            graph_object = tmpdir + '/' + str(self.id) + 'redirects.' + '.dbpedia.json'
        else:
            graph_object = tmpdir + '/' + str(self.id) + '.dbpedia.json'
        if not os.path.exists(graph_object):

            stop_uris_set = open(settings.ABS_PATH('stop_uri.txt')).read().split()
            stop_uris_set = set([x.split('/')[-1] for x in stop_uris_set])

            def recurse_populate_graph(resource, graph, depth):
                if resource in stop_uris_set:
                    return
                if depth == 0:
                    return
                if 'Category' in resource:
                    query = u'SELECT ?broader, ?related, ?broaderof WHERE' \
                            u' {{{{ <http://dbpedia.org/resource/{0}> skos:broader ?broader }}' \
                            u' UNION {{ ?broaderof skos:broader <http://dbpedia.org/resource/{0}> }}' \
                            u' UNION {{ ?related skos:related <http://dbpedia.org/resource/{0}> }}' \
                            u' UNION {{ <http://dbpedia.org/resource/{0}> skos:related ?related }}}}'.format(resource)

                    results = []
                    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
                    sparql.setReturnFormat(JSON)
                    sparql.setQuery(query)
                    results.extend(sparql.query().convert()['results']['bindings'])
                    for result in results:
                        for rel_type, value in result.iteritems():
                            uri = value['value']
                            parent_resource = uri.split('/')[-1]
                            #print '  ' * (3 - depth), resource, '->', parent_resource
                            graph.add_edge(resource, parent_resource, type=rel_type)
                            recurse_populate_graph(parent_resource, graph, depth-1)
                else:
                    if resource == 'cumulative gain':
                        resource = 'Discounted_cumulative_gain'
                    elif resource == 'world wide web conference':
                        resource = 'International_World_Wide_Web_Conference'
                    wiki_cat_query = u'http://en.wikipedia.org/w/api.php?action=query&titles={0}&prop=categories&cllimit=50&clshow=!hidden&format=json&redirects'
                    results = json.loads(requests.get(wiki_cat_query.format(resource)).text)['query']['pages'].values()[0]
                    if 'missing' in results:
                        results = json.loads(requests.get(wiki_cat_query.format(resource.title())).text)['query']['pages'].values()[0]
                        if 'missing' in results:
                            print results, resource
                            results = []
                        else:
                            results = [c['title'].replace(' ', '_') for c in results['categories']]
                    else:
                        results = [c['title'].replace(' ', '_') for c in results['categories']]
                    rel_type = "subject"
                    for parent_resource in results:
                        #print '  ' * (3 - depth), resource, '->', parent_resource
                        graph.add_edge(resource, parent_resource, type=rel_type)
                        recurse_populate_graph(parent_resource, graph, depth-1)

            import networkx as nx
            from SPARQLWrapper import SPARQLWrapper, JSON

            graph = nx.Graph()
            ngrams = set(self.articlecollocation_set.values_list('ngram', flat=True))
            ngrams = self.CollocationModel.COLLECTION_MODEL.objects.filter(ngram__in=ngrams)
            for ngram in ngrams:
                if 'dbpedia' in ngram.source or (redirects and 'wiki_redirect' in ngram.source):
                    recurse_populate_graph(ngram.ngram, graph, 2)

            json_graph.dump(graph, open(graph_object, 'w'))
        else:
            graph = json_graph.load(open(graph_object))
        return graph
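Each category resource triggers one SPARQL query against the public DBpedia endpoint. A minimal standalone sketch of that skos:broader lookup with SPARQLWrapper (the category name is a made-up example):

# Standalone skos:broader lookup against DBpedia, for illustration.
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setQuery(
    'SELECT ?broader WHERE '
    '{ <http://dbpedia.org/resource/Category:Graph_theory> skos:broader ?broader }')
results = sparql.query().convert()['results']['bindings']
print([r['broader']['value'] for r in results])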