Example #1
    def on_start(self):
        def parse_line(line):
            # Parse a JSON record and schedule its Dianping shop page for crawling.
            business = json.loads(line.strip())
            business_id = business.get('business_id', None)
            requestUrl = 'http://www.dianping.com/shop/%s' % business_id
            self.crawl(requestUrl, callback=self.index_page)

        def parse_add_rep(line):
            # Take the url field before the "!@#$" delimiter and pass it to add_rep.
            url = line.strip().split("!@#$")[0]
            self.add_rep(url)

        day = '2015_7_12'

        RDD.txtfile('/search/data/dianping/result' + day).map(parse_line)
Example #2
import numpy as np
import pandas as pd
from sklearn_extra.cluster import KMedoids  # from the scikit-learn-extra package
# RDD is this project's own module (provides realworld_distance_compare)


def kmedoid_clustering(g, r, measure, num_cluster):
    # Build a pairwise RDD distance matrix, one column per node.
    all_rdds_df = pd.DataFrame()

    for target_one in g:
        rdd_list = []
        for target_two in g:
            rdd_list.append(
                RDD.realworld_distance_compare(g, target_one, target_two,
                                               measure, r))
        all_rdds_df[target_one] = rdd_list

    # Cluster the pairwise distances with k-medoids.
    np_of_rdds = np.array(all_rdds_df)
    kmedoids = KMedoids(n_clusters=num_cluster, random_state=0).fit(np_of_rdds)
    cluster_data = kmedoids.labels_

    node_list = []
    degree_list = []
    rad_list = []
    # Populate and construct a DataFrame with basic node information
    for node in g:
        node_list.append(node)
        degree_list.append(g.degree(node))
        # TODO: Broken
        rad_list.append(1)

    df = pd.DataFrame({
        'node_name': node_list,
        'radius': rad_list,
        'degree': degree_list,
        'cluster': cluster_data
    })

    return df
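
A hedged invocation sketch: assuming a networkx graph and this project's RDD/measures modules (measures.global_graph_degree as used in the visualization example below), the clustering could be driven like this:

    import networkx as nx

    # Hypothetical usage; RDD and measures are this project's own modules.
    g = nx.karate_club_graph()
    clusters = kmedoid_clustering(g, r=2, measure=measures.global_graph_degree,
                                  num_cluster=3)
    print(clusters[['node_name', 'cluster']].head())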
Example #3
    def parallelize(self, c, numSlices=None):
        # Store the collection as a new stage, record the call, and return
        # an RDD handle pointing at that stage/task.
        nextstage = self.getnextstage()
        self.values[nextstage] = c
        self.maxtask += 1
        self.addmethod("parallelize",
                       thisstage=nextstage,
                       thistask=self.maxtask)
        return RDD(self, stage=nextstage, task=self.maxtask)
Example #4
    def generate_blocks(self, user_id, job_profile):
        # Generate all RDDs and their blocks, one RDD per stage.
        for stage_id in job_profile.keys():
            task_num = job_profile[stage_id]['Task_Num']
            rdd_name = 'user%s_rdd%s' % (user_id, stage_id)
            # The task count of this stage equals the partition count of the new RDD.
            this_rdd = RDD(rdd_name, task_num)
            block_list = list()
            for index in range(0, task_num):
                # Block size is chosen randomly according to the co-flow trace.
                this_block = Block(this_rdd, this_rdd.name, index, user_id)
                block_list.append(this_block)
            this_rdd.set_blocklist(block_list)
            self.rdd_list.append(this_rdd)
Example #5
def executeRDD(parent, func_name, params):
    # Dispatch a serialized method call onto an RDD: "RDD" called on "head"
    # constructs a new RDD; anything else is looked up on the parent object.
    if func_name == "RDD" and parent == "head":
        return RDD(eval(params))
    param = ()
    if len(params) > 0:
        param = eval(params)  # params arrives as a Python-literal string
    if callable(param):
        # A single callable (e.g. a lambda) is passed through as one argument.
        return getattr(parent, func_name)(param)
    return getattr(parent, func_name)(*param)
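
A minimal sketch of how this dispatcher might be driven, assuming the RDD class accepts a list and exposes a map method (both hypothetical here):

    # Hypothetical calls; the "params" strings are eval'd into Python objects.
    rdd = executeRDD("head", "RDD", "[1, 2, 3]")          # -> RDD([1, 2, 3])
    doubled = executeRDD(rdd, "map", "lambda x: x * 2")   # -> rdd.map(<lambda>)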
Example #6
def visualize_rdd(g1, u, r, pos, m=measures.global_graph_degree):
    """Takes a graph and plots it, coloring vertices by RDD.

    Args:
    -----
        g1: a networkx graph
        u: source node
        r: target radius
        pos: node positions keyed by node, e.g. from nx.spring_layout(g1)
        m: a measure function from measures

    Returns:
    --------
        fig: a figure object of a scatter plot
    """
    df = RDD.get_rdds_for_visuals(g1, u, m, r)
    nodes_x = []
    nodes_y = []

    for p in pos.values():
        x, y = p[0], p[1]
        nodes_x.append(x)
        nodes_y.append(y)

    df['nodes_x'] = nodes_x
    df['nodes_y'] = nodes_y

    edges_x = []
    edges_y = []
    for e in g1.edges():
        x0, y0 = pos[e[0]]
        x1, y1 = pos[e[1]]
        edges_x.append(x0)
        edges_x.append(x1)
        edges_x.append(None)
        edges_y.append(y0)
        edges_y.append(y1)
        edges_y.append(None)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=edges_x, y=edges_y, name='edges', mode='lines', line={'width': 1}))
    fig.add_trace(go.Scatter(x=df['nodes_x'],
                             y=df['nodes_y'],
                             customdata=df[['rdd', 'degree']].values,
                             hovertemplate="Node: %{text} <br> RDD: %{customdata[0]} <br> Degree: %{customdata[1]} <extra></extra>",
                             text=df['node_name'],
                             name="nodes",
                             mode='markers+text'))
    fig.update_layout(template="plotly_dark", dragmode='pan')
    fig.update_traces(marker={'size': 10, 'color': df['rdd'], 'colorscale': 'Jet'})
    fig.write_html("graph.html", config={'scrollZoom': True})
    # Show the interactive plot, then return the figure as documented.
    fig.show(config={'scrollZoom': True})
    return fig
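
A hedged usage sketch, assuming a networkx graph and this project's RDD/measures modules; spring_layout supplies the pos mapping the function expects:

    import networkx as nx

    # Hypothetical usage; the default measure comes from the measures module.
    g = nx.karate_club_graph()
    pos = nx.spring_layout(g)
    visualize_rdd(g, u=0, r=2, pos=pos)  # writes graph.html and shows the plot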
Example #7
    def textFile(self, name, minPartitions=None, use_unicode=False):
        # With a real SparkContext available, read and materialize the file;
        # otherwise fall back to the seed values stored in stage 0.
        nextstage = self.getnextstage()
        if self.realsc:
            self.values[nextstage] = self.realsc.textFile(
                name, minPartitions=minPartitions,
                use_unicode=use_unicode).collect()
        else:
            self.values[nextstage] = self.values[0]
        self.maxtask += 1
        self.addmethod("textFile", thisstage=nextstage, thistask=self.maxtask)
        return RDD(self, stage=nextstage, task=self.maxtask)
Example #8
def visualize_rdd_vector(g1, u, r, pos, measure_vector):
    # Plot g1, coloring nodes by normalized RDD for a vector of measures.
    df = RDD.get_rdds_for_visuals_vector(g1, u, measure_vector, r)
    nodes_x = []
    nodes_y = []

    for node_name in g1.nodes:
        x, y = pos[node_name][0], pos[node_name][1]
        nodes_x.append(x)
        nodes_y.append(y)

    df['nodes_x'] = nodes_x
    df['nodes_y'] = nodes_y

    edges_x = []
    edges_y = []
    for e in g1.edges():
        x0, y0 = pos[e[0]]
        x1, y1 = pos[e[1]]
        edges_x.append(x0)
        edges_x.append(x1)
        # A None entry breaks the line trace, so each edge is drawn as its
        # own segment rather than one connected path.
        edges_x.append(None)
        edges_y.append(y0)
        edges_y.append(y1)
        edges_y.append(None)
    # Build the hover template from the DataFrame column names
    custom_data = df.columns
    hover_template = ""
    for i, m in enumerate(custom_data):
        hover_template += m + ": %{customdata[" + str(i) + "]} <br> "

    fig = go.FigureWidget()
    fig.add_trace(go.Scatter(x=edges_x, y=edges_y, name='edges', mode='lines', line={'width': 1}))
    fig.add_trace(go.Scatter(x=df['nodes_x'],
                             y=df['nodes_y'],
                             customdata=df[custom_data],
                             hovertemplate=hover_template,
                             text=list(g1.nodes),
                             name="Node",
                             mode='markers+text'))
    fig.update_layout(template="plotly_dark", dragmode='pan')
    fig.update_traces(marker={'size': 15, 'color': df['normalized_rdd'], 'colorscale': 'Jet'})
    fig.write_html("graph.html", config={'scrollZoom': True})
    return fig
Example #9
import pandas as pd
from sklearn.cluster import AgglomerativeClustering


def agglomerative_hierarchical_clustering(g, r, measure, num_cluster):
    # Build a pairwise RDD distance matrix, one column per node.
    all_rdds_df = pd.DataFrame()

    for target_one in g:
        rdd_list = []
        for target_two in g:
            rdd_list.append(
                RDD.realworld_distance_compare(g, target_one, target_two,
                                               measure, r))
        all_rdds_df[target_one] = rdd_list

    data = all_rdds_df
    # dend = shc.dendrogram(shc.linkage(data, method='ward'))

    # linkage='ward' requires euclidean distances, which is the default metric.
    cluster = AgglomerativeClustering(n_clusters=num_cluster, linkage='ward')
    cluster_data = cluster.fit_predict(data)

    node_list = []
    degree_list = []
    rad_list = []
    # Populate and construct a DataFrame with basic node information
    for node in g:
        node_list.append(node)
        degree_list.append(g.degree(node))
        # TODO: Broken
        rad_list.append(1)

    df = pd.DataFrame({
        'node_name': node_list,
        'radius': rad_list,
        'degree': degree_list,
        'cluster': cluster_data
    })

    return df
Example #10
from rdd import RDD, toTSVLine

rdd = RDD("./datasets/albums.csv")

# Map each album line to (artist_id, 1) so albums per artist can be counted
albums = rdd.map(lambda line: (int(line.split(',')[1]), 1))
added = albums.reduceByKey(lambda x, y: x + y)
# sortByKey() fixes the tie order; sortBy() then sorts by count, descending
completelysorted = added.sortByKey().sortBy(lambda x: x[1], ascending=False)

# Save to TSV file
lines = completelysorted.map(toTSVLine)
lines.saveAsTextFile('./datasets/result_4')
Example #11
    def emptyRDD(self):
        # Register a new empty stage and return an RDD handle for it.
        nextstage = self.getnextstage()
        self.values[nextstage] = []
        self.maxtask += 1
        self.addmethod("emptyRDD", thisstage=nextstage, thistask=self.maxtask)
        return RDD(self, stage=nextstage, task=self.maxtask)
Example #12
def visualize_rdd_vector_mean_shift(g1, u, r, measure_vector, pos, vistype=1):
    # Plot g1 with mean-shift cluster labels layered over the RDD values.
    df = RDD.get_rdds_for_visuals_vector(g1, u, measure_vector, r)
    df = other_sims.mean_shift(df, measure_vector)
    nodes_x = []
    nodes_y = []

    for p in pos.values():
        x, y = p[0], p[1]
        nodes_x.append(x)
        nodes_y.append(y)

    df['nodes_x'] = nodes_x
    df['nodes_y'] = nodes_y

    edges_x = []
    edges_y = []
    for e in g1.edges():
        x0, y0 = pos[e[0]]
        x1, y1 = pos[e[1]]
        edges_x.append(x0)
        edges_x.append(x1)
        edges_x.append(None)
        edges_y.append(y0)
        edges_y.append(y1)
        edges_y.append(None)

    # Build the hover template from the DataFrame column names
    custom_data = df.columns
    hover_template = ""
    for i, m in enumerate(custom_data):
        hover_template += m + ": %{customdata[" + str(i) + "]} <br> "

    fig = go.FigureWidget()
    fig.add_trace(go.Scatter(x=edges_x, y=edges_y, name='edges', mode='lines', line={'width': 1}))
    fig.add_trace(go.Scatter(x=df['nodes_x'],
                             y=df['nodes_y'],
                             customdata=df[custom_data],
                             hovertemplate=hover_template,
                             text=df['node_name'],
                             name="Node",
                             mode='markers'))
    # The annotation points a red arrow at the first node row in the DataFrame
    fig.update_layout(template="plotly_dark", dragmode='pan',
                      annotations=[{'x': df.iloc[0]['nodes_x'],
                                    'y': df.iloc[0]['nodes_y'],
                                    'axref': 'x',
                                    'ayref': 'y',
                                    'arrowsize': 4,
                                    'arrowcolor': 'red',
                                    'showarrow': True,
                                    'arrowhead': 3}])
    if vistype == 0:
        # Marker size encodes the cluster label; color encodes normalized RDD.
        fig.update_traces(
            marker={'size': ((df['cluster'] * 5) + 10), 'color': df['normalized_rdd'], 'colorscale': 'Jet'})
    elif vistype == 1:
        # Fixed size; color encodes the cluster label.
        fig.update_traces(
            marker={'size': 15, 'color': df['cluster'], 'colorscale': 'Jet'})
    else:
        print('vistype must be 0 or 1')

    fig.write_html("graph.html", config={'scrollZoom': True})

    return fig
Example #13
from rdd import RDD

rdd1 = RDD('./datasets/artists.csv')
rdd2 = RDD('./datasets/albums.csv')


def find_name(line):
    # Prefer the name in field 2; fall back to field 1 when it is empty.
    if line[2]:
        return line[2]
    return line[1]


# (artist_id, artist_name)
norwegian_artists = rdd1.map(lambda line: line.split(',')).filter(
    lambda x: x[5] == 'Norway').map(lambda y: (y[0], find_name(y)))

# (artist_id, mtv_review)
albums = rdd2.map(lambda line: line.split(',')).map(lambda x:
                                                    (x[1], float(x[8])))

# (artist_id, (artist_name, mtv_review))
norwegian_albums = norwegian_artists.join(albums)

# (artist_id, (artist_name, avg_critic))
# A pairwise-average reduce would weight later albums more heavily, so carry
# a (sum, count) pair through the reduce and divide once at the end.
with_counts = norwegian_albums.map(lambda x: (x[0], (x[1][0], x[1][1], 1)))
summed = with_counts.reduceByKey(
    lambda x, y: (x[0], x[1] + y[1], x[2] + y[2]))
reduced = summed.map(lambda x: (x[0], (x[1][0], x[1][1] / x[1][2])))

# Save as TSV file
reduced.map(lambda x: '{name}\tNorway\t{mtv}'.format(name=x[1][0], mtv=x[1][1])
            ).saveAsTextFile("./datasets/result_9")
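
A quick check of why carrying (sum, count) matters, with hypothetical review scores:

    # Pairwise averaging is not the true mean once an artist has 3+ albums:
    pairwise = ((4.0 + 6.0) / 2 + 8.0) / 2   # 6.5 -- the last album weighs more
    true_mean = (4.0 + 6.0 + 8.0) / 3        # 6.0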
Example #14
from rdd import RDD

# Make an RDD of artists, map year of birth (as int, so the comparison is
# numeric rather than lexicographic) and find the min value.
year = RDD("./datasets/artists.csv").map(
    lambda line: int(line.split(",")[4])).min()

print("The oldest artist was born in {}.".format(year))
Example #15
from rdd import RDD, toTSVLine
from operator import add


# RDD from text file
rdd = RDD('./datasets/albums.csv')

# Create key/value pairs of (genre, tracks sold); the list is already sorted by id, so id is not needed
genres = rdd.map(lambda line: (line.split(',')[3], int(line.split(',')[6])))

# Aggregate all genres and sum the sales numbers
sortbysales = genres.reduceByKey(add)

# sortByKey() sorts alphabetically. sortBy() sorts by number of sales in descending order
completed = sortbysales.sortByKey().sortBy(lambda x: x[1], ascending=False)

# Save to TSV file
lines = completed.map(toTSVLine)
lines.saveAsTextFile('./datasets/result_5')
Example #16
from rdd import RDD
from pyspark import SparkContext, SparkConf

# RDD from text file
rdd = RDD('./datasets/albums.csv')

# Create key/value pairs of (album id, average critic)
critics = rdd.map(lambda line: line.split(',')).map(
    lambda x: (x[0], (float(x[7]) + float(x[8]) + float(x[9])) / 3))

# sortByKey() fixes the tie order; sortBy() sorts by average critic score in descending order
sortedreview = critics.sortByKey().sortBy(lambda x: x[1], ascending=False)

# Get the 10 best albums based on avg critic
sc = SparkContext.getOrCreate(SparkConf())
top = sc.parallelize(c=sortedreview.take(10))

# Save as TSV file. set coalesce(1) so that we can use this file in Task 7
top.map(lambda x: '{album}\t{avg}'.format(album=x[0], avg=x[1])).coalesce(
    1).saveAsTextFile("./datasets/result_6")
Example #17
from rdd import RDD

count = RDD("./datasets/albums.csv").map(lambda line: line.split(",")[3]).distinct().count()
print("There are {} distinct genres in albums.csv.".format(count))