def main():
    """Run an iterative (power-method) PageRank over an edge-list file and
    print the top-k / bottom-k ranked nodes.

    Expects the dataset path as the first CLI argument. Node ids are assumed
    to be 1-based integers (the code subtracts 1 when indexing the matrix
    and adds 1 back when printing) — TODO confirm against the input format.
    """
    datasetfile = sys.argv[1]
    beta = 0.8        # damping factor: probability of following a link
    iterations = 40   # fixed number of power-method iterations (no convergence test)
    top_k = 5         # how many largest/smallest nodes to report
    sparkcontext = SparkContext("local", "Page Rank")
    data = sparkcontext.textFile(datasetfile)
    # Parsers defined elsewhere in this file; presumably each line yields a
    # (source, destination) pair — verify against make_key_value_pair_1/2.
    source_dest = data.map(make_key_value_pair_1)
    source_dest_count = data.map(make_key_value_pair_2)
    groupbykey = source_dest.groupByKey()
    # NOTE(review): count() of grouped keys counts only nodes that appear as
    # sources; nodes that are pure sinks would be missed — confirm intended.
    number_of_nodes = groupbykey.count()
    out_degree = groupbykey.map(calc_out_degree)
    pair_map = groupbykey.collectAsMap()
    # Build the column-stochastic transition matrix M locally:
    # M[dest-1][source-1] = 1 / out_degree(source).
    matrix_m = np.zeros(shape=(number_of_nodes, number_of_nodes))
    for key, value in pair_map.items():
        for ind_value in value:
            matrix_m[ind_value - 1][key - 1] += 1 / len(list(value))
    # Distribute M as a row matrix so multiply() runs on the cluster.
    matrix_m = sparkcontext.parallelize(matrix_m)
    matrix_m = RowMatrix(matrix_m)
    # r_0: uniform rank vector, one entry per node.
    vector_r_prev = np.empty([number_of_nodes, 1])
    vector_r_prev.fill(1 / number_of_nodes)
    vector_r_prev = DenseMatrix(number_of_nodes, 1, vector_r_prev)
    index = 0
    while (index < iterations):
        # r_{t+1} = beta * M * r_t + (1 - beta) / N  (teleport term).
        # NOTE(review): RowMatrix.rows.collect() ordering is relied on to
        # match node order — confirm rows preserve the parallelize order.
        mul_val = matrix_m.multiply(vector_r_prev).rows.collect()
        mul_val = [i * beta for i in mul_val]
        mul_val = [i + (1 - beta) / number_of_nodes for i in mul_val]
        vector_r_prev = DenseMatrix(number_of_nodes, 1, mul_val)
        index += 1
    vector_r_prev = vector_r_prev.toArray()
    # Rank extraction: values and (0-based) indexes of the k largest/smallest
    # entries; __getitem__ serves as the key function over the index range.
    largest_values = heapq.nlargest(top_k, vector_r_prev)
    largest_indexes = heapq.nlargest(top_k, range(number_of_nodes),
                                     vector_r_prev.__getitem__)
    smallest_values = heapq.nsmallest(top_k, vector_r_prev)
    smallest_indexes = heapq.nsmallest(top_k, range(number_of_nodes),
                                       vector_r_prev.__getitem__)
    # Convert back to the 1-based node numbering used in the input.
    largest_indexes = [val + 1 for val in largest_indexes]
    smallest_indexes = [val + 1 for val in smallest_indexes]
    print("Value of largest n nodes\n", largest_values)
    print("Node numbers of largest n nodes\n", largest_indexes)
    print("Value of smallest n nodes\n", smallest_values)
    print("Node numbers of smallest n nodes\n", smallest_indexes)
    sparkcontext.stop()
def test_dense_matrix_is_transposed(self):
    """A row-major (isTransposed=True) DenseMatrix must equal, index, and
    convert exactly like its column-major counterpart."""
    # The same logical 3x2 matrix, stored row-major vs. column-major.
    row_major = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
    col_major = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
    self.assertEqual(row_major, col_major)

    expected = [[0, 4], [1, 6], [3, 9]]
    # Element access must follow the logical layout, not the storage order.
    for i, row in enumerate(expected):
        for j, cell in enumerate(row):
            self.assertEqual(row_major[i, j], cell)
    self.assertTrue(array_equal(row_major.toArray(), expected))

    # Sparse conversion emits CSC form regardless of the dense storage order.
    sparse = row_major.toSparse()
    self.assertTrue(array_equal(sparse.rowIndices, [1, 2, 0, 1, 2]))
    self.assertTrue(array_equal(sparse.colPtrs, [0, 2, 5]))
    self.assertTrue(array_equal(sparse.values, [1, 3, 4, 6, 9]))
# Embeddings: one latent-factor vector per item from the trained model,
# ordered by id so matrix columns line up with the id column later.
a = model.itemFactors
b = a.sort("id")
b.show()

# Creating a dense matrix from embedding for businesses.
# FIX: `lambda (x, y): y` is Python-2-only tuple-parameter unpacking
# (removed by PEP 3113, SyntaxError on Python 3); unpack by index instead.
values = (b.rdd.map(lambda row: (row.id, row.features))
          .sortByKey()
          .flatMap(lambda pair: pair[1])
          .collect())
nrow = len(b.rdd.map(lambda row: row.features).first())  # embedding dimension
ncol = b.count()                                         # number of items
dm = DenseMatrix(nrow, ncol, values)
# Transpose so each ROW of z is one item's embedding (samples x features),
# the orientation scikit-learn's TSNE expects.
z = dm.toArray().transpose()

# t-SNE: project the embeddings down to 2-D for visualisation.
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(z)

# Creating a data frame with t-SNE results and business_id: round-trip the
# coordinates through Spark, then concatenate column-wise with the ids.
e = sqlContext.createDataFrame(pd.DataFrame(X_tsne))
e_df = e.toPandas()
j = b.select("id")
j_df = j.toPandas()
# pd.concat already returns a DataFrame, so no extra pd.DataFrame(...) wrap
# (the original re-wrapped the result, copying it for no effect).
result = pd.concat([e_df, j_df], axis=1, ignore_index=True)