from pattern.web import Twitter, plaintext
from pattern.en import parsetree
from pattern.search import search
from pattern.graph import Graph


def get_pattern_data(search_param):
    twitter = Twitter(language='en')
    for tweet in twitter.search(search_param, cached=True):
        print(plaintext(tweet.text).encode('ascii', 'ignore').decode('utf-8'))

    g = Graph()
    for i in range(10):
        for result in twitter.search(search_param, start=i + 1, count=50):
            s = result.text.lower()
            s = plaintext(s)
            s = parsetree(s)
            p = '{NP} (VP) ' + search_param + ' {NP}'
            for m in search(p, s):
                x = m.group(1).string  # NP left
                y = m.group(2).string  # NP right
                if x not in g:
                    g.add_node(x)
                if y not in g:
                    g.add_node(y)
                g.add_edge(g[x], g[y], stroke=(0, 0, 0, 0.75))  # R,G,B,A

    #if len(g) > 0:
    #    g = g.split()[0]  # Largest subgraph.

    for n in g.sorted()[:40]:  # Sort by Node.weight.
        n.fill = (0, 0.5, 1, 0.75 * n.weight)

    g.export('data', directed=False, weighted=0.6)

def compare_visualization(product_sku, compare_phrase):
    all_reviews = ReviewInfo.objects.all().filter(sku=product_sku)
    g = Graph()
    count = 0.0
    for e in all_reviews:
        s = e.comment.lower()
        s = plaintext(s)
        s = parsetree(s)
        #p = '{NP} (VP) faster than {NP}'
        p = '{NP} (VP) ' + compare_phrase + ' {NP}'
        for m in search(p, s):
            x = m.group(1).string  # NP left
            y = m.group(2).string  # NP right
            if x not in g:
                g.add_node(x)
            if y not in g:
                g.add_node(y)
            g.add_edge(g[x], g[y], stroke=(0, 0, 0, 0.75))  # R,G,B,A
        count += 1.0
        print count / len(all_reviews), '\r'

    if len(g) > 0:
        g = g.split()[0]  # Largest subgraph.
        for n in g.sorted()[:80]:  # Sort by Node.weight.
            n.fill = (0, 0.5, 1, 0.75 * n.weight)
        g.export('static/compare_visualization', directed=True, weighted=2.0)
        return True
    else:
        return False

from pattern.graph import Graph


def make_graph(dgram, n, numWord):
    if n == 1:
        graph = Graph(distance=4.0)
        center = graph.add_node(' ', radius=0)
        center.fill = (0, 0, 0, 0)
        for gram in dgram:
            key = gram
            w = dgram[gram] / numWord
            node = graph.add_node(key, centrality=w, radius=dgram[gram] + 1)
            node.fill = (0, 0.5, 1, node.radius * 0.1)
            graph.add_edge(center, node, length=2000 / node.radius, stroke=(0, 0, 0, 0))  # R,G,B,A
        graph.export('/home/matrx63/Web/monogram', pack=False,
                     width='2000', height='2000', frames=5000, ipf=30)

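# A minimal usage sketch for make_graph(), under the assumption that dgram is a
# word -> count mapping (e.g. a collections.Counter over tokens) and numWord is
# the total number of tokens; the export path above is machine-specific.
from collections import Counter

tokens = "the quick brown fox jumps over the lazy dog the fox".split()
dgram = Counter(tokens)                            # unigram frequencies
make_graph(dgram, n=1, numWord=float(len(tokens)))
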
def render_graph(self, domains):
    """Renders graph output."""
    g = Graph()
    for domain in domains.keys():
        if domain in self.cat_dict:
            categories = self.cat_dict[domain]
            stroke = (0, 0, 0, 0.5)
            if 'right' in categories:
                stroke = (255, 0, 0, 1)
            elif 'right_center' in categories:
                stroke = (255, 0, 0, .5)
            if 'left' in categories:
                stroke = (0, 0, 255, 1)
            elif 'left_center' in categories:
                stroke = (0, 0, 255, .5)
            if 'least_biased' in categories:
                stroke = (0, 255, 0, 1)
            fill = (128, 128, 0, 0.1)
            dub_cats = ['fake', 'questionable', 'clickbait', 'unreliable', 'conspiracy']
            score = len([c for c in categories if c in dub_cats])
            if score:
                fill = (0, 0, 0, float(score) / 5)
            g.add_node(domain, radius=len(domains[domain]) * 6,
                       stroke=stroke, strokewidth=6, fill=fill, font_size=30)

    pairs = self.pairwise(domains.keys())
    for x, y in pairs:
        x_queries = set(domains[x])
        y_queries = set(domains[y])
        intersection = len(x_queries.intersection(y_queries))
        if intersection > 0:
            max_rad = max(len(domains[x]), len(domains[y])) + 1000
            g.add_edge(x, y, length=max_rad, strokewidth=intersection)

    path = 'graph'
    g.export(path, encoding='utf-8', distance=6, directed=False, width=1400, height=900)

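# The pairwise() helper called above is not shown in this snippet; a minimal
# sketch (an assumption, not the original implementation) that yields every
# unordered pair of domains could look like this:
import itertools

def pairwise(self, items):
    # All 2-element combinations: ['a', 'b', 'c'] -> ('a','b'), ('a','c'), ('b','c').
    return itertools.combinations(list(items), 2)
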
def visualize_rel(self):
    orderedPairs = []
    for i in range(len(self.subject_object_dict)):
        orderedPair = list(itertools.product(
            self.subject_object_dict["S" + str(i + 1)][0],
            self.subject_object_dict["S" + str(i + 1)][1]))
        orderedPairs.append(orderedPair)

    g = Graph()
    for node in orderedPairs:
        for n1, n2 in node:
            g.add_node(n1)
            g.add_node(n2)
            g.add_edge(n1, n2, weight=0.0, type='is-related-to')
    g.export('FeatureRelations', directed=True)

    orig_stdout = sys.stdout
    gn = file('GraphNodeWeights.txt', 'a')
    sys.stdout = gn
    for n in sorted(g.nodes, key=lambda n: n.weight):
        print '%.2f' % n.weight, n
    sys.stdout = orig_stdout
    gn.close()

from pattern.graph import Graph


class LogStatGraph:

    def __init__(self, name=None):
        self.name = name
        self.graph = Graph()

    def load(self, log_stat):
        if self.name is None:
            self.name = log_stat.repo_name
        for commit in log_stat.commits:
            author_email = commit.ae
            self.graph.add_node(author_email, fill=BLACK_50)
            for diffstat in commit.diffstats:
                file_path = diffstat["file_path"]
                self.graph.add_node(file_path, stroke=BLACK_25, text=BLACK_15)
                self.graph.add_edge(author_email, file_path, stroke=BLACK_25)

    def prune(self, depth=0):
        self.graph.prune(depth)

    def export(self, path=None, **kwargs):
        if path is None:
            path = self.name
        self.graph.export(path, directed=True, weighted=True, **kwargs)

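# A minimal usage sketch for LogStatGraph. The log_stat object and the BLACK_*
# colour constants come from the surrounding project and are not shown here;
# the stand-ins below are only illustrative assumptions.
from types import SimpleNamespace

BLACK_50 = (0, 0, 0, 0.50)
BLACK_25 = (0, 0, 0, 0.25)
BLACK_15 = (0, 0, 0, 0.15)

log_stat = SimpleNamespace(
    repo_name="my-repo",
    commits=[SimpleNamespace(ae="dev@example.com",
                             diffstats=[{"file_path": "src/main.py"}])])

graph = LogStatGraph()
graph.load(log_stat)
graph.prune(depth=0)   # drop nodes with no connections
graph.export()         # writes the interactive HTML visualization to ./my-repo
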
#g = g.split()[0]
#print 'NUMBER OF NODES ' + str(len(g.nodes))
#node_removed = 0
#for node in g.nodes:
#    if node.weight <= 0.07:
#        print node_removed
#        node_removed += 1
#        g.remove(node)
#g = g.split()[0]
#g.eigenvector_centrality()
#print '\nNUMBER OF NODES ' + str(len(g.nodes))

#for graph in g.split():
#    graph.export('hashtags' + str(i), title='Hashtags Network', width=800, height=600, directed=False, repulsion=60)
#    i += 1

# We only have the subgraph:
#g.split()[0].export('hashtags', title='Hashtags Network', width=900, height=550, directed=False, k=7, repulsion=60)
#g.export('hashtags', title='Hashtags Network', width=900, height=550, directed=False, k=7, repulsion=60)

g.export('sound', directed=True)
#g.serialize()

print '\n\ngraph exported'

import os
import sys
sys.path.insert(0, os.path.join("..", ".."))

from pattern.graph import Graph, WEIGHT, CENTRALITY, DEGREE, DEFAULT
from random import choice, random

# This example demonstrates how a graph visualization can be exported to GraphML,
# a file format that can be opened in Gephi (https://gephi.org).

g = Graph()
# Random nodes.
for i in range(50):
    g.add_node(i)
# Random edges.
for i in range(75):
    node1 = choice(g.nodes)
    node2 = choice(g.nodes)
    g.add_edge(node1, node2, weight=random())
g.prune(0)

# This node's label is different from its id.
# FIXME this fails if the 1 has been pruned
# g[1].text.string = "home"

# By default, Graph.export() exports to HTML,
# but if we give it a filename that ends in .graphml it will export to GraphML.
g.export(os.path.join(os.path.dirname(__file__), "test.graphml"))

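# A quick sanity check (not part of the original example): GraphML is plain XML,
# so the exported file can be inspected with the standard library, regardless of
# the XML namespace the exporter declares.
import xml.etree.ElementTree as ET

tree = ET.parse(os.path.join(os.path.dirname(__file__), "test.graphml"))
nodes = [e for e in tree.iter() if e.tag.rsplit("}", 1)[-1] == "node"]
edges = [e for e in tree.iter() if e.tag.rsplit("}", 1)[-1] == "edge"]
print("%s nodes, %s edges" % (len(nodes), len(edges)))
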
from pattern.web import Twitter
from pattern.graph import Graph

comparisons = []  # (A, B) pairs collected below.

for i in range(1, 10):
    # Set cached=False for live results:
    for result in Twitter(language="en").search("\"is the new\"", start=i, count=100, cached=True):
        s = result.text
        s = s.replace("\n", " ")
        s = s.lower()
        s = s.replace("is the new", "NEW")
        s = s.split(" ")
        try:
            i = s.index("NEW")
            A = s[i - 1].strip("?!.:;,#@\"'")
            B = s[i + 1].strip("?!.:;,#@\"'")
            # Exclude common phrases such as "this is the new thing".
            if A and B and A not in ("it", "this", "here", "what", "why", "where"):
                comparisons.append((A, B))
        except:
            pass

g = Graph()
for A, B in comparisons:
    e = g.add_edge(B, A)  # "A is the new B": A <= B
    e.weight += 0.1
    print(("%s => %s" % (B, A)).encode('utf-8'))

# Not all nodes will be connected, there will be multiple subgraphs.
# Simply take the largest subgraph for our visualization.
g = g.split()[0]

g.export("trends", weighted=True, directed=True)

# This node's label is different from its id.
# We'll make it a hyperlink, see the href attribute at the bottom.
# FIXME this fails if the 1 has been pruned
# g[1].text.string = "home"

# The export() command generates a folder with an index.html,
# that displays the graph using an interactive, force-based spring layout.
# You can drag the nodes around - open index.html in a browser and try it out!
# The layout can be tweaked in many ways:
g.export(os.path.join(os.path.dirname(__file__), "test"),
    width=700,           # <canvas> width.
    height=500,          # <canvas> height.
    frames=500,          # Number of frames of animation.
    directed=True,       # Visualize eigenvector centrality as an edge arrow?
    weighted=0.5,        # Visualize betweenness centrality as a node shadow?
    pack=True,           # Keep clusters close together + visualize node weight as node radius?
    distance=10,         # Average edge length.
    k=4.0,               # Force constant.
    force=0.01,          # Force dampener.
    repulsion=50,        # Force radius.
    stylesheet=DEFAULT,  # INLINE, DEFAULT, None or the path to your own stylesheet.
    javascript=None,
    href={"1": "http://www.clips.ua.ac.be/pages/pattern-graph"},  # Node.id => URL
    css={"1": "node-link-docs"}                                   # Node.id => CSS class.
)

from pattern.web import Bing, plaintext
from pattern.en import parsetree
from pattern.search import search
from pattern.graph import Graph

g = Graph()
for i in range(10):
    #for result in Bing().search('"more important than"', start=i + 1, count=50):
    for result in Bing().search('"is less important than"', start=i + 1, count=50):
        s = result.text.lower()
        s = plaintext(s)
        s = parsetree(s)
        #p = '{NP} (VP) more important than {NP}'
        p = '{NP} (VP) is less important than {NP}'
        for m in search(p, s):
            x = m.group(1).string  # NP left
            y = m.group(2).string  # NP right
            if x not in g:
                g.add_node(x)
            if y not in g:
                g.add_node(y)
            g.add_edge(g[x], g[y], stroke=(0, 0, 0, 0.75))  # R,G,B,A

g = g.split()[0]  # Largest subgraph.

for n in g.sorted()[:40]:  # Sort by Node.weight.
    n.fill = (0, 0.5, 1, 0.75 * n.weight)

g.export('test', directed=True, weighted=0.6)

from pattern.web import Twitter
from pattern.graph import Graph

comparisons = []  # (A, B) pairs collected below.

for i in range(1, 10):
    # Set cached=False for live results:
    for result in Twitter(language="en").search("\"is the new\"", start=i, count=100, cached=True):
        s = result.text
        s = s.replace("\n", " ")
        s = s.lower()
        s = s.replace("is the new", "NEW")
        s = s.split(" ")
        try:
            i = s.index("NEW")
            A = s[i - 1].strip("?!.:;,#@\"'")
            B = s[i + 1].strip("?!.:;,#@\"'")
            # Exclude common phrases such as "this is the new thing".
            if A and B and A not in ("it", "this", "here", "what", "why", "where"):
                comparisons.append((A, B))
        except:
            pass

g = Graph()
for A, B in comparisons:
    e = g.add_edge(B, A)  # "A is the new B": A <= B
    e.weight += 0.1
    print B, "=>", A

# Not all nodes will be connected, there will be multiple subgraphs.
# Simply take the largest subgraph for our visualization.
g = g.split()[0]

g.export("trends", weighted=True, directed=True, overwrite=True)

class WebCrawler():

    def __init__(self, args, depth=1):
        self.links = [WebPage(x) for x in args.url]
        self.depth = depth
        self.historyDb = WebsiteDatabase()
        self.done = False
        self.options = args
        self.results = {link.url.domain: Result() for link in self.links}
        self.cloudIndexer = CloudSearchIndexer.forDomainIndex("websites")

        if args.graph or args.rank:
            self.webGraph = Graph(distance=30.0)
            for link in self.links:
                self.webGraph.add_node(link.url.domain, radius=15, fill=(1, 0, 0, 0.5))

    def __del__(self):
        self.cloudIndexer._commitToAmazon()

    def crawl(self):
        if len(self.links) < 1:
            self.done = True
            self.finish()
            return

        site = self.links.pop(0)

        if self.historyDb.wasPageVisited(site):
            print 'reading data'
            site = self.historyDb.readWebPage(site.url.string,
                                              isExternal=site.isExternal,
                                              depth=site.depth)
        else:
            print 'downloading'
            try:
                site.downloadContent()
            except HTTP404NotFound:
                return self.fail(site, "404 not found")
            except URLTimeout:
                return self.fail(site, "Timeout error")
            except URLError as err:
                return self.fail(site, str(err))

        connected = True
        if site.depth == self.depth:
            connected = False
        self.historyDb.insertWebpage(site, connection=connected)
        self.historyDb.appendSession(site)

        for link in site.getLinks():
            if self.isValidForQueue(link):
                if link.isExternal and (self.options.graph or self.options.rank):
                    self.addDomainNode(link)
                    if site.depth < self.depth:
                        self.links.append(link)
                elif not link.isExternal and site.depth < self.depth:
                    self.links.insert(0, link)

        if not self.historyDb.wasPageVisited(site):
            self.visit(site)
        site.cleanCashedData()

    def isValidForQueue(self, link):
        if link not in self.links and not link.url.anchor:
            if self.historyDb.isInThisSession(link):
                self.historyDb.insertRelation(link.parent, link)
            else:
                return True
        return False

    def addDomainNode(self, page):
        match = re.search("\.", page.url.domain)
        if not match:
            return
        if page.parent.url.domain == page.url.domain:
            return
        if self.webGraph.node(page.url.domain) is None:
            self.webGraph.add_node(page.url.domain, radius=15)
        if self.webGraph.edge(page.parent.url.domain, page.url.domain) is None:
            self.webGraph.add_edge(page.parent.url.domain, page.url.domain,
                                   weight=0.0, type='is-related-to')

    def visit(self, page):
        print 'visited: ', page.url.string, ' domain: ', page.url.domain, 'graph', self.options.graph
        self.cloudIndexer.addDocument(page)

        if page.isExternal and self.options.graph and page.url.domain not in self.results.keys():
            self.webGraph.node(page.url.domain).fill = (0, 1, 0, 0.5)

        try:
            if self.options.text:
                self.results[page.url.domain].wordStats += page.countWords()
            if self.options.a:
                links = [link.url.string for link in page.getLinks()]
                self.results[page.url.domain].links.update(links)
            if self.options.image:
                self.results[page.url.domain].images.update(page.getImages())
            if self.options.script:
                self.results[page.url.domain].scripts.update(page.getScripts())
        except Exception as e:
            print "Error parsing document: ", type(e).__name__ + ': ' + str(e)

    def fail(self, link, error):
        print 'failed:', link.url.string, 'err: ', error

    def finish(self):
        """Print all results and calculate cosine similarity between all provided URLs."""
        self.historyDb.clearSession()
        with Emitter(self.options.console, self.options.file) as output:
            for key, value in self.results.iteritems():
                output.emitLine(key)
                value.emit(output)

            if len(self.results) > 1 and self.options.text and self.options.cos:
                combinations = [list(x) for x in itertools.combinations(self.results.keys(), 2)]
                for pair in combinations:
                    cosValue = self.results[pair[0]].cosineSimilarity(self.results[pair[1]])
                    output.emitLine(u"cos similarity between:{0} and {1} = {2}".format(
                        pair[0], pair[1], cosValue))

            output.emitLine('')
            #output.emitLine("max depth: " + str(max(site.depth for site in self.history)))
            #output.emitLine("sites visited: " + str(len(self.history)))

            if self.options.graph:
                self.webGraph.eigenvector_centrality()
                self.webGraph.export('graph', directed=True, width=2200, height=1600, repulsion=10)

            if self.options.rank:
                ranks = self.calculatePageRank()
                output.emitLine('')
                output.emit(ranks)

    def calculatePageRank(self):
        adjMap = adjacency(self.webGraph, directed=True, stochastic=True)
        domains = adjMap.keys()
        M = np.zeros((len(domains), len(domains)))

        for idx, domain in enumerate(domains):
            connections = adjMap[domain].keys()
            for connection in connections:
                M[idx, domains.index(connection)] = adjMap[domain][connection]

        M = np.transpose(M)
        #M = np.array([[0,0,0,0,1], [0.5,0,0,0,0], [0.5,0,0,0,0], [0,1,0.5,0,0], [0,0,0.5,1,0]])
        #M = np.array([[0, 0.5, 0], [0.5, 0.5, 0], [0.5, 0, 0]])
        pageScores = self.executeComputations(M)
        print pageScores

        ranks = dict(zip(domains, pageScores))
        ranks = sorted(ranks.items(), key=operator.itemgetter(1))
        return ranks

    def executeComputations(self, M):
        damping = 0.80
        error = 0.0000001
        N = M.shape[0]
        v = np.ones(N)
        v = v / np.linalg.norm(v, 1)
        last_v = np.full(N, np.finfo(float).max)

        # Columns with no outgoing links are spread uniformly (dangling nodes).
        for i in range(0, N):
            if sum(M[:, i]) == 0:
                M[:, i] = np.full(N, 1.0 / N)

        M_hat = np.multiply(M, damping) + np.full((N, N), (1 - damping) / N)
        while np.linalg.norm(v - last_v) > error:
            last_v = v
            v = np.matmul(M_hat, v)

        return np.round(v, 6)

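# A standalone sketch of the same power iteration used in executeComputations(),
# run on the small 3x3 test matrix that appears commented out above; handy for
# checking the damping/convergence behaviour outside the crawler. The function
# name below is illustrative, not part of the original project.
import numpy as np

def pagerank_power_iteration(M, damping=0.80, error=1e-7):
    N = M.shape[0]
    # Columns with no outgoing links are spread uniformly (dangling nodes).
    for i in range(N):
        if M[:, i].sum() == 0:
            M[:, i] = np.full(N, 1.0 / N)
    M_hat = damping * M + np.full((N, N), (1 - damping) / N)
    v = np.ones(N) / N
    last_v = np.full(N, np.inf)
    while np.linalg.norm(v - last_v) > error:
        last_v, v = v, M_hat.dot(v)
    return np.round(v, 6)

M = np.array([[0.0, 0.5, 0.0],
              [0.5, 0.5, 0.0],
              [0.5, 0.0, 0.0]])
print(pagerank_power_iteration(M))
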
from pattern.graph import Graph
import webbrowser

g = Graph()

n1 = "asdasd"
n2 = "two"
n3 = "three"
n4 = "four"
n5 = "five"

g.add_node(n1)
g.add_node(n2)
g.add_node(n3)
g.add_node(n4)
g.add_node(n5)

g.add_edge(n2, n3)
g.add_edge(n3, n4)
g.add_edge(n4, n5)

"""for n1, n2 in (
        ('cat', 'tail'),
        ('cat', 'purr'), ('purr', 'sound'),
        ('dog', 'tail'),
        ('dog', 'bark'), ('bark', 'sound')):
    g.add_node(n1)
    g.add_node(n2)
    g.add_edge(n1, n2, weight=0.0, type='is-related-to')"""

g.export('sound')
webbrowser.open(u"file:///Users/tobiasfuma/Desktop/FirmenbuchCrawler/sound/index.html")

from pattern.web import Bing, plaintext
from pattern.en import parsetree
from pattern.search import search
from pattern.graph import Graph

g = Graph()
for i in range(10):
    for result in Bing().search('"more important than"', start=i + 1, count=50):
        s = result.text.lower()
        s = plaintext(s)
        s = parsetree(s)
        p = '{NP} (VP) more important than {NP}'
        for m in search(p, s):
            x = m.group(1).string  # NP left
            y = m.group(2).string  # NP right
            if x not in g:
                g.add_node(x)
            if y not in g:
                g.add_node(y)
            g.add_edge(g[x], g[y], stroke=(0, 0, 0, 0.75))  # R,G,B,A

g = g.split()[0]  # Largest subgraph.

for n in g.sorted()[:40]:  # Sort by Node.weight.
    n.fill = (0, 0.5, 1, 0.75 * n.weight)

g.export('test', directed=True, weighted=0.6)