def on_start(self):
    def parse_line(line):
        # each result line is a JSON record; queue its Dianping shop page for crawling
        business = json.loads(line.strip())
        business_id = business.get('business_id', None)
        requestUrl = 'http://www.dianping.com/shop/%s' % (business_id)
        self.crawl(requestUrl, callback=self.index_page)

    def parse_add_rep(line):
        # the URL is the first "!@#$"-delimited field of the line
        url = line.strip().split("!@#$")[0]
        self.add_rep(url)

    day = '2015_7_12'
    RDD.txtfile('/search/data/dianping/result' + day).map(parse_line)
def kmedoid_clustering(g, r, measure, num_cluster):
    # Build the pairwise RDD distance matrix between every pair of nodes
    all_rdds_df = pd.DataFrame()
    for target_one in g:
        rdd_list = []
        for target_two in g:
            rdd_list.append(
                RDD.realworld_distance_compare(g, target_one, target_two, measure, r))
        all_rdds_df[target_one] = rdd_list
    data = all_rdds_df
    np_of_rdds = np.array(data)

    # Cluster the distance matrix with k-medoids
    kmedoids = KMedoids(n_clusters=num_cluster, random_state=0).fit(np_of_rdds)
    cluster_data = kmedoids.labels_

    node_list = []
    degree_list = []
    rad_list = []
    # Populate and construct a DataFrame with basic node information
    for node in g:
        node_list.append(node)
        degree_list.append(g.degree(node))
        # TODO: Broken
        rad_list.append(1)

    df = pd.DataFrame({
        'node_name': node_list,
        'radius': rad_list,
        'degree': degree_list,
        'cluster': cluster_data
    })
    return df
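# Usage sketch (hedged): kmedoid_clustering needs only a networkx graph plus the
# project's RDD and measures modules; the graph, radius r=2, measure, and
# num_cluster=3 below are illustrative assumptions, not values taken from this repo.
# import networkx as nx
# g = nx.karate_club_graph()
# clusters = kmedoid_clustering(g, r=2, measure=measures.global_graph_degree, num_cluster=3)
# print(clusters.sort_values('cluster'))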
def parallelize(self, c, numSlices=None):
    nextstage = self.getnextstage()
    self.values[nextstage] = c
    self.maxtask += 1
    self.addmethod("parallelize", thisstage=nextstage, thistask=self.maxtask)
    return RDD(self, stage=nextstage, task=self.maxtask)
def generate_blocks(self, user_id, job_profile):
    # generate all rdds and blocks
    for stage_id in job_profile.keys():
        task_num = job_profile[stage_id]['Task_Num']
        rdd_name = 'user%s_rdd%s' % (user_id, stage_id)
        # task_num in this stage = partition_num of the generated rdd
        this_rdd = RDD(rdd_name, task_num)
        block_list = list()
        for index in range(0, task_num):
            # block size will be chosen randomly according to the co-flow trace
            this_block = Block(this_rdd, this_rdd.name, index, user_id)
            block_list.append(this_block)
        this_rdd.set_blocklist(block_list)
        self.rdd_list.append(this_rdd)
def executeRDD(parent, func_name, params):
    # "head" is the entry point: construct the initial RDD from the evaluated params
    if func_name == "RDD" and parent == "head":
        return RDD(eval(params))
    param = ()
    if len(params) > 0:
        param = eval(params)
    # A callable (e.g. a lambda) is passed through as a single argument;
    # anything else is treated as an argument tuple and unpacked
    if hasattr(param, '__call__'):
        return getattr(parent, func_name)(param)
    return getattr(parent, func_name)(*param)
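# Usage sketch (hedged): executeRDD dispatches a method name and a parameter string
# onto an existing RDD object; the constructor argument, method names, and parameter
# strings below are illustrative assumptions about that class, not confirmed API.
# head_rdd = executeRDD("head", "RDD", "[1, 2, 3]")          # builds RDD(eval("[1, 2, 3]"))
# doubled = executeRDD(head_rdd, "map", "lambda x: x * 2")   # callable branch
# first_two = executeRDD(doubled, "take", "(2,)")            # tuple-unpacking branch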
def visualize_rdd(g1, u, r, pos, m=measures.global_graph_degree):
    """takes a graph and plots it, coloring vertices by RDD

    Args:
    -----
    g1: a networkx graph
    u: source node
    r: target radius
    pos: a dict mapping each node to its (x, y) layout position
    m: a measure function from measures

    Returns:
    --------
    fig: a figure object of a scatter plot
    """
    df = RDD.get_rdds_for_visuals(g1, u, m, r)
    # pos = spring_layout(g1)
    # pos = nx.spring_layout(g1, scale=5)

    # Node coordinates from the supplied layout
    nodes_x = []
    nodes_y = []
    for p in pos.values():
        x, y = p[0], p[1]
        nodes_x.append(x)
        nodes_y.append(y)
    df['nodes_x'] = nodes_x
    df['nodes_y'] = nodes_y

    # Edge coordinates; a None entry separates consecutive line segments
    edges_x = []
    edges_y = []
    for e in g1.edges():
        x0, y0 = pos[e[0]]
        x1, y1 = pos[e[1]]
        edges_x.append(x0)
        edges_x.append(x1)
        edges_x.append(None)
        edges_y.append(y0)
        edges_y.append(y1)
        edges_y.append(None)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=edges_x, y=edges_y, name='edges',
                             mode='lines', line={'width': 1}))
    fig.add_trace(go.Scatter(
        x=df['nodes_x'], y=df['nodes_y'],
        customdata=df[['rdd', 'degree']].values,
        hovertemplate="Node: %{text} <br> RDD: %{customdata[0]} <br> "
                      "Degree: %{customdata[1]} <extra></extra>",
        text=df['node_name'], name="nodes", mode='markers+text'))
    fig.update_layout(template="plotly_dark", dragmode='pan')
    fig.update_traces(marker={'size': 10, 'color': df['rdd'], 'colorscale': 'Jet'})
    fig.write_html("graph.html", config={'scrollZoom': True})
    return fig.show(config={'scrollZoom': True})
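# Example call (a sketch): positions are computed once with networkx and passed in,
# matching the commented-out spring_layout lines above; u=0 and r=2 are arbitrary.
# import networkx as nx
# g = nx.karate_club_graph()
# pos = nx.spring_layout(g)
# visualize_rdd(g, u=0, r=2, pos=pos)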
def textFile(self, name, minPartitions=None, use_unicode=False):
    nextstage = self.getnextstage()
    if self.realsc:
        self.values[nextstage] = self.realsc.textFile(
            name, minPartitions=minPartitions, use_unicode=use_unicode).collect()
    else:
        self.values[nextstage] = self.values[0]
    self.maxtask += 1
    self.addmethod("textFile", thisstage=nextstage, thistask=self.maxtask)
    return RDD(self, stage=nextstage, task=self.maxtask)
def visualize_rdd_vector(g1, u, r, pos, measure_vector):
    df = RDD.get_rdds_for_visuals_vector(g1, u, measure_vector, r)
    # pos = nx.spring_layout(g1)

    nodes_x = []
    nodes_y = []
    for node_name in g1.nodes:
        x, y = pos[node_name][0], pos[node_name][1]
        nodes_x.append(x)
        nodes_y.append(y)
    # for p in pos.values():
    #     x, y = p[0], p[1]
    #     nodes_x.append(x)
    #     nodes_y.append(y)
    df['nodes_x'] = nodes_x
    df['nodes_y'] = nodes_y

    edges_x = []
    edges_y = []
    for e in g1.edges():
        x0, y0 = pos[e[0]]
        x1, y1 = pos[e[1]]
        edges_x.append(x0)
        edges_x.append(x1)
        # a None entry ends one edge segment so plotly does not connect it to the next
        edges_x.append(None)
        edges_y.append(y0)
        edges_y.append(y1)
        edges_y.append(None)

    # get the custom_data and build the hover template from the DataFrame column names
    custom_data = df.columns
    hover_template = ""
    for i, m in enumerate(custom_data):
        hover_template += "".join(m + ":" + ' %{customdata[' + str(i) + ']} <br> ')

    fig = go.FigureWidget()
    fig.add_trace(go.Scatter(x=edges_x, y=edges_y, name='edges',
                             mode='lines', line={'width': 1}))
    fig.add_trace(go.Scatter(
        x=df['nodes_x'], y=df['nodes_y'],
        customdata=df[custom_data],
        hovertemplate=hover_template,
        text=list(g1.nodes),
        # text=['node_name'],
        name="Node", mode='markers+text'))
    fig.update_layout(template="plotly_dark", dragmode='pan', )
    fig.update_traces(marker={'size': 15, 'color': df['normalized_rdd'],
                              'colorscale': 'Jet'})
    fig.write_html("graph.html", config={'scrollZoom': True})
    return fig
def agglomerative_hierarchical_clustering(g, r, measure, num_cluster):
    all_rdds_df = pd.DataFrame()
    for target_one in g:
        rdd_list = []
        for target_two in g:
            rdd_list.append(
                RDD.realworld_distance_compare(g, target_one, target_two, measure, r))
        all_rdds_df[target_one] = rdd_list
    data = all_rdds_df

    # dend = shc.dendrogram(shc.linkage(data, method='ward'))
    cluster = AgglomerativeClustering(n_clusters=num_cluster,
                                      affinity='euclidean', linkage='ward')
    cluster_data = cluster.fit_predict(data)

    node_list = []
    degree_list = []
    rad_list = []
    # Populate and construct a DataFrame with basic node information
    for node in g:
        node_list.append(node)
        degree_list.append(g.degree(node))
        # TODO: Broken
        rad_list.append(1)

    df = pd.DataFrame({
        'node_name': node_list,
        'radius': rad_list,
        'degree': degree_list,
        'cluster': cluster_data
    })
    return df
from rdd import RDD, toTSVLine

rdd = RDD("./datasets/albums.csv")

# map every line to key/value pairs for fast reduce and sorting
albums = rdd.map(lambda line: (int(line.split(',')[1]), 1))
added = albums.reduceByKey(lambda x, y: x + y)
completelysorted = added.sortByKey().sortBy(lambda x: x[1], ascending=False)

# Save to TSV file
lines = completelysorted.map(toTSVLine)
lines.saveAsTextFile('./datasets/result_4')
def emptyRDD(self):
    nextstage = self.getnextstage()
    self.values[nextstage] = []
    self.maxtask += 1
    self.addmethod("emptyRDD", thisstage=nextstage, thistask=self.maxtask)
    return RDD(self, stage=nextstage, task=self.maxtask)
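# Usage sketch (hedged): parallelize, textFile, and emptyRDD above appear to be methods
# of a recording SparkContext-like wrapper that logs each call via addmethod and returns
# lightweight RDD handles; the class name FakeContext below is purely hypothetical.
# ctx = FakeContext()
# rdd_a = ctx.parallelize([1, 2, 3])
# rdd_b = ctx.textFile('/tmp/input.txt')
# rdd_c = ctx.emptyRDD()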
def visualize_rdd_vector_mean_shift(g1, u, r, measure_vector, pos, vistype=1):
    df = RDD.get_rdds_for_visuals_vector(g1, u, measure_vector, r)
    df = other_sims.mean_shift(df, measure_vector)
    # pos = nx.spring_layout(g1)

    nodes_x = []
    nodes_y = []
    for p in pos.values():
        x, y = p[0], p[1]
        nodes_x.append(x)
        nodes_y.append(y)
    df['nodes_x'] = nodes_x
    df['nodes_y'] = nodes_y

    edges_x = []
    edges_y = []
    for e in g1.edges():
        x0, y0 = pos[e[0]]
        x1, y1 = pos[e[1]]
        edges_x.append(x0)
        edges_x.append(x1)
        edges_x.append(None)
        edges_y.append(y0)
        edges_y.append(y1)
        edges_y.append(None)

    # get the custom_data and build the hover template from the DataFrame column names
    custom_data = df.columns
    hover_template = ""
    for i, m in enumerate(custom_data):
        hover_template += "".join(m + ":" + ' %{customdata[' + str(i) + ']} <br> ')

    fig = go.FigureWidget()
    fig.add_trace(go.Scatter(x=edges_x, y=edges_y, name='edges',
                             mode='lines', line={'width': 1}))
    fig.add_trace(go.Scatter(
        x=df['nodes_x'], y=df['nodes_y'],
        customdata=df[custom_data],
        hovertemplate=hover_template,
        text=df['node_name'],
        name="Node", mode='markers'))
    fig.update_layout(template="plotly_dark", dragmode='pan',
                      annotations=[{'x': df.iloc[0]['nodes_x'],
                                    'y': df.iloc[0]['nodes_y'],
                                    'axref': 'x', 'ayref': 'y',
                                    'arrowsize': 4, 'arrowcolor': 'red',
                                    'showarrow': True, 'arrowhead': 3}])
    if vistype == 0:
        fig.update_traces(
            marker={'size': ((df['cluster'] * 5) + 10),
                    'color': df['normalized_rdd'], 'colorscale': 'Jet'})
    elif vistype == 1:
        fig.update_traces(
            marker={'size': 15, 'color': df['cluster'], 'colorscale': 'Jet'})
    else:
        print('enter proper vistype')
    fig.write_html("graph.html", config={'scrollZoom': True})
    return fig
from rdd import RDD

rdd1 = RDD('./datasets/artists.csv')
rdd2 = RDD('./datasets/albums.csv')


def find_name(line):
    if line[2]:
        return line[2]
    return line[1]


# (artist_id, artist_name)
norwegian_artists = rdd1.map(lambda line: line.split(',')).filter(
    lambda x: x[5] == 'Norway').map(lambda y: (y[0], find_name(y)))

# (artist_id, mtv_review)
albums = rdd2.map(lambda line: line.split(',')).map(lambda x: (x[1], float(x[8])))

# (artist_id, (artist_name, mtv_review))
norwegian_albums = norwegian_artists.join(albums)

# (artist_id, (artist_name, avg_critic))
reduced = norwegian_albums.reduceByKey(lambda x, y: (x[0], (x[1] + y[1]) / 2))

# Save as TSV file
reduced.map(lambda x: '{name}\tNorway\t{mtv}'.format(name=x[1][0], mtv=x[1][1])
            ).saveAsTextFile("./datasets/result_9")
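# Note (hedged): the pairwise reduceByKey above is only an exact average when an artist
# has at most two albums. Assuming the rdd wrapper exposes the standard PySpark
# mapValues/reduceByKey API, an exact per-key mean can be sketched by carrying a
# (name, sum, count) triple through the reduce:
# exact_avg = norwegian_albums.mapValues(lambda v: (v[0], v[1], 1)).reduceByKey(
#     lambda a, b: (a[0], a[1] + b[1], a[2] + b[2])).mapValues(
#     lambda t: (t[0], t[1] / t[2]))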
from rdd import RDD

# Make RDD of artists, map year of birth and find min value.
year = RDD("./datasets/artists.csv").map(lambda line: line.split(",")[4]).min()
print("The oldest artist was born in {}.".format(year))
from rdd import RDD, toTSVLine
from operator import add

# RDD from text file
rdd = RDD('./datasets/albums.csv')

# Create key/value pairs of (genre, tracks sold). The list is already sorted by id,
# so we don't need the id.
genres = rdd.map(lambda line: (''.join(line.split(',')[3]), int(line.split(',')[6])))

# Aggregate all genres and sum the sales numbers
sortbysales = genres.reduceByKey(add)

# sortByKey() sorts alphabetically. sortBy() sorts by number of sales in descending order
completed = sortbysales.sortByKey().sortBy(lambda x: x[1], ascending=False)

# Save to TSV file
lines = completed.map(toTSVLine)
lines.saveAsTextFile('./datasets/result_5')
from rdd import RDD
from pyspark import SparkContext, SparkConf

# RDD from text file
rdd = RDD('./datasets/albums.csv')

# Create key/value pairs of (album id, average critic)
critics = rdd.map(lambda line: line.split(',')).map(
    lambda x: (x[0], (float(x[7]) + float(x[8]) + float(x[9])) / 3))

# sortByKey() sorts alphabetically. sortBy() sorts by average critic in descending order
sortedreview = critics.sortByKey().sortBy(lambda x: x[1], ascending=False)

# Get the 10 best albums based on avg critic
sc = SparkContext.getOrCreate(SparkConf())
top = sc.parallelize(c=sortedreview.take(10))

# Save as TSV file. Set coalesce(1) so that we can use this file in Task 7
top.map(lambda x: '{album}\t{avg}'.format(album=x[0], avg=x[1])).coalesce(
    1).saveAsTextFile("./datasets/result_6")
from rdd import RDD

count = RDD("./datasets/albums.csv").map(lambda line: line.split(",")[3]).distinct().count()
print("There are {} distinct genres in albums.csv.".format(count))