Example #1
        # framework config: partition counts for ReduceByKey/GroupByKey and the input split size
        RDD._config = {
            'num_partition_RBK': 2,
            'num_partition_GBK': 2,
            'split_size': 128,
        }
        t = rdd.TextFile(self.filename)
        m = rdd.Map(t, (lambda urls: parseNeighbors(urls)))
        links = rdd.GroupByKey(m)
        ranks = rdd.Map(links, lambda url_neighbors: (url_neighbors[0], 1.0))
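        # each iteration: join ranks with the link lists, redistribute rank, then apply damping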
        for iteration in range(5):
            joins = rdd.Join([links, ranks])
            contribs = rdd.FlatMap(
                joins, lambda url_urls_rank: computeContribs(
                    url_urls_rank[1][0], url_urls_rank[1][1]))
            rbk = rdd.ReduceByKey(contribs, lambda a, b: a + b)
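            # damping step: new_rank = 0.15 + 0.85 * sum(contributions)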
            ranks = rdd.MapValue(rbk, lambda rank: rank * 0.85 + 0.15)
        ranks.collect(driver)


if __name__ == '__main__':

    master_address = sys.argv[1]
    self_address = sys.argv[2]
    filepath = sys.argv[3]

    page_rank_client = PageRankClient(filepath)
    # page_rank_client = PageRankClient(sys.argv[1])
    client = get_client(master_address)
    execute_command(client, client.get_job, pickle_object(page_rank_client),
                    self_address)
    print "[Client]Job Submited...."
    page_rank_client.start_server(self_address)
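
The snippet above assumes two helpers that are not shown: parseNeighbors, which turns a whitespace-separated "source target" line into a key/value pair, and computeContribs, which spreads a page's current rank across its outgoing links. A minimal sketch of what they usually look like, modeled on the classic Spark PageRank example (their exact form in this project is an assumption):

import re

# Hypothetical helpers, reconstructed from how the snippet calls them.
def parseNeighbors(urls):
    # "source<whitespace>target" -> (source, target)
    parts = re.split(r'\s+', urls)
    return parts[0], parts[1]

def computeContribs(urls, rank):
    # split a page's rank evenly among the pages it links to
    num_urls = len(urls)
    for url in urls:
        yield (url, rank / num_urls)
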
Example #2
        RDD._config = {'num_partition_RBK': 2,
                       'num_partition_GBK': 2,
                       'split_size': 128,
                       "driver_addr": ""}
        RDD._streaming = 20
        lines = rdd.Streaming(driver.num_partition)
        f = rdd.FlatMap(lines, lambda x: parse_lines(x))
        m = rdd.Map(f, lambda x: (x, 1))
        counts = rdd.ReduceByKey(m, lambda a, b: a + b)
        counts.collect(driver)


if __name__ == '__main__':

    name, master_address, self_address, interval = sys.argv
    # word count streaming client
    word_count_client = StreamingWordCountClient(master_address, int(interval))
    obj = pickle_object(word_count_client)

    # assign job
    client = get_client(master_address)
    job_id = execute_command(client, client.get_job, obj, self_address)
    debug_print_by_name('wentao', str(job_id))

    # send data
    send_data_thread = gevent.spawn(send_word, job_id, master_address)
    print "[Client]Job Submited...."
    client_thread = gevent.spawn(word_count_client.start_server, self_address)
    gevent.joinall([send_data_thread, client_thread])

    # word_count_client.start_server(self_address)
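
The streaming snippet assumes a parse_lines helper that breaks each incoming chunk of text into words before the Map step turns them into (word, 1) pairs. A minimal sketch under that assumption (the real helper may also filter or normalize tokens):

# Hypothetical helper, inferred from how FlatMap uses it above.
def parse_lines(line):
    # split a chunk of streamed text into individual words
    return line.strip().split()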
