# Read the input as text; each line is parsed as whitespace-separated numbers.
data = sc.textFile('hdfs://node1:9000/input/vectors_3000x500.txt')
data = data.map(lambda line: np.array(line.strip().split()).astype(float))
# Scale every row to unit Euclidean length, so the row-by-row dot products
# computed below are cosine similarities.
data = data.map(lambda vec: vec / np.linalg.norm(vec))

# Attach a row index to each vector and wrap the result as a distributed matrix.
indexed_rows = data.zipWithIndex().map(
    lambda vec_and_idx: IndexedRow(vec_and_idx[1], vec_and_idx[0]))
row_matrix = IndexedRowMatrix(indexed_rows)

# The transpose is taken on the coordinate representation; both operands are
# then converted to block matrices for the multiplication.
transposed_coords = row_matrix.toCoordinateMatrix().transpose()
U = row_matrix.toBlockMatrix()
UT = transposed_coords.toBlockMatrix()

# S = U * U^T: entry (i, j) is the dot product of normalised rows i and j.
S = U.multiply(UT)
S_coord = S.toCoordinateMatrix()
sim = S_coord.entries

print(sim.take(100))
debug.TIMESTAMP(2)
# Instantiate a SQLContext for its side effects only; the instance is not kept.
# NOTE(review): presumably this is what makes RDD.toDF() available -- confirm
# it is still needed, since toDF() is not called in this part of the script.
sql.SQLContext(sc)

# Read the matrix as (row_index, [values...]) pairs, one text line per row.
data = sc.textFile(dataset)
data = (
    data.map(lambda s: [float(x) for x in s.split()])
    .zipWithIndex()
    .map(lambda x: (x[1], x[0]))
)

# Build the transpose by hand: emit (col_index, (row_index, value)) for every
# cell, then gather all cells that belong to the same column.
tdata = (
    sc.textFile(dataset)
    .map(lambda s: [(col, float(tok)) for col, tok in enumerate(s.split())])
    .zipWithIndex()
    .flatMap(lambda x: [(col, (x[1], val)) for col, val in x[0]])
    .groupByKey()
)

# Bring the transposed data into the same (index, [values...]) shape as `data`,
# ordering each column's cells by their original row index.
# The values are materialised as a list: a bare map() is a lazy iterator on
# Python 3, which IndexedRowMatrix cannot convert into a vector.
tdata = tdata.map(
    lambda x: (x[0], [val for _, val in sorted(x[1], key=itemgetter(0))]))

# Create BlockMatrix instances for the matrix and its transpose.
mat = IndexedRowMatrix(data)
mat = mat.toBlockMatrix()
matTranspose = IndexedRowMatrix(tdata).toBlockMatrix()

# Final result: mat * mat^T * mat.
# Note that `matTranspose` is rebound here and holds mat * mat^T afterwards.
matTranspose = mat.multiply(matTranspose)
matRes = matTranspose.multiply(mat)
print('Done')
def get_Total_Related_Downloads(self, dfmain):
    """Write per-dataset download totals and most-similar datasets as JSON.

    Builds a dataset-by-ip download-count matrix from ``dfmain``, computes
    pairwise cosine similarities between datasets with Spark, and for every
    dataset records its total download count together with the datasets
    that appear in its ``self.num_top_dataset`` highest-similarity pairs.

    Args:
        dfmain: pandas DataFrame with at least the columns ``_id`` (dataset
            id; must be convertible with ``int()``) and ``ip``.

    Side effects:
        * Writes one dataset id per line to ``self.DATA_LIST_FILE``.
        * Writes the result mapping to ``self.JSONUSAGE_FILE`` as JSON:
          ``{dataset_id: {"related_datasets": [...], "total_downloads": n}}``.
        * Creates a SparkContext named "simdownload" and always stops it,
          including when an error occurs.
        * Prints progress messages to stdout.
    """
    # Total number of download records per dataset.
    download_count = dfmain.groupby(['_id'])['_id'].agg(['count'])

    # One row per (dataset, ip) pair with the number of downloads for it.
    group = pd.DataFrame({
        'download_count': dfmain.groupby(['_id', 'ip']).size()
    }).reset_index()
    person_u = list(group.ip.unique())
    dataset_u = list(group._id.unique())

    # Persist the dataset order; matrix row i corresponds to dataset_u[i].
    with open(self.DATA_LIST_FILE, "w") as out_file:
        for dataset_id in dataset_u:
            out_file.write(str(dataset_id))
            out_file.write("\n")

    counts = group['download_count'].tolist()
    # Position of each id/ip inside dataset_u / person_u. pd.Categorical is
    # used because Series.astype('category', categories=...) is no longer
    # accepted by current pandas releases.
    row = pd.Categorical(group._id, categories=dataset_u).codes
    cols = pd.Categorical(group.ip, categories=person_u).codes
    len_dataset = len(dataset_u)
    len_person = len(person_u)
    # A shape of (309235, 81566) was noted for this matrix in the original.
    print("Datasets vs Ips :", str(len_dataset), str(len_person))

    # Use the narrowest integer type that can hold the largest count. A fixed
    # int8 silently wraps any count above 127 and corrupts the similarities.
    count_dtype = np.min_scalar_type(max(counts)) if counts else np.int8
    sparsemat = sparse.csr_matrix((counts, (row, cols)),
                                  dtype=count_dtype,
                                  shape=(len_dataset, len_person))

    def row_to_dict(values):
        # Column names are the stringified column positions.
        return {str(pos): float(val) for pos, val in enumerate(values)}

    sc = SparkContext(appName="simdownload")
    try:
        sqlContext = SQLContext(sc)

        # NOTE: toarray() materialises the full dense matrix on the driver.
        sv_rdd = sc.parallelize(sparsemat.toarray())
        # Populate a DataFrame with one column per ip.
        dfspark = sv_rdd.map(lambda x: Row(**row_to_dict(x))).toDF()
        row_with_index = Row(*["id"] + dfspark.columns)

        def make_row(columns):
            def _make_row(row, uid):
                row_dict = row.asDict()
                return row_with_index(
                    *[uid] + [row_dict.get(c) for c in columns])
            return _make_row

        print('parallelize-ok')
        # Kept under its own name rather than reusing the dict helper's name,
        # so the lambdas above and below cannot pick up the wrong function.
        add_index = make_row(dfspark.columns)

        # New DataFrame with a leading "id" column holding the row position.
        dfidx = dfspark.rdd.zipWithIndex().map(
            lambda x: add_index(*x)).toDF(
                StructType([StructField("id", LongType(), False)] +
                           dfspark.schema.fields))

        # Cosine similarity between rows: transpose so that datasets become
        # columns, then use columnSimilarities().
        pred = IndexedRowMatrix(
            dfidx.rdd.map(lambda row: IndexedRow(row.id, row[1:])))
        pred1 = pred.toBlockMatrix().transpose().toIndexedRowMatrix()
        pred_sims = pred1.columnSimilarities()

        # Convert the CoordinateMatrix entries into a DataFrame.
        columns = ['from', 'to', 'sim']
        vals = pred_sims.entries.map(lambda e: (e.i, e.j, e.value))
        dfsim = sqlContext.createDataFrame(vals, columns)
        # Every loop iteration below runs a query against dfsim; caching
        # avoids recomputing the whole similarity pipeline for each one.
        dfsim.cache()
        print('Sim Done!')
        print('Time Sim Done: ' + time.strftime("%H:%M:%S"))

        json_data = {}
        for i, dataset_id in enumerate(dataset_u):
            target_id = int(dataset_id)
            dftemp = dfsim.where(
                (psf.col("from") == i) | (psf.col("to") == i)).sort(
                    psf.desc("sim")).limit(self.num_top_dataset)
            df = dftemp.toPandas()

            # For each pair that involves dataset i, keep the other endpoint.
            related_positions = []
            for from_id, to_id in zip(df['from'], df['to']):
                if from_id != i:
                    related_positions.append(int(from_id))
                if to_id != i:
                    related_positions.append(int(to_id))
            related_datasets = [
                int(dataset_u[pos]) for pos in related_positions
            ]

            downloads = download_count.loc[target_id]['count']
            json_data[target_id] = {
                'related_datasets': related_datasets,
                'total_downloads': int(downloads),
            }

        print('Time JSONUSAGE_FILE 1: ' + time.strftime("%H:%M:%S"))
        with open(self.JSONUSAGE_FILE, 'w') as fp:
            json.dump(json_data, fp)
        print('Time JSONUSAGE_FILE 2: ' + time.strftime("%H:%M:%S"))
    finally:
        # Always release the context; a leaked SparkContext prevents a new
        # one from being created in the same process.
        sc.stop()
# Parse each text line into a list of floats, then pair every row with its
# position and flip the pair into (row_index, values).
A = (
    A.map(lambda line: list(map(float, line.split())))
    .zipWithIndex()
    .map(lambda row_and_idx: (row_and_idx[1], row_and_idx[0]))
)

# Progress marker for step 1 (original note: about 1 min with the full set).
for banner in (" ", "Step 1 ready", " "):
    print(banner)

# Wrap the indexed rows as an IndexedRowMatrix and convert it to a BlockMatrix
# with explicit block sizes. Per the original notes, (1000, 1000) worked for
# the sample set (all data in one block); (100, 1000) is the setting being
# tested with the full dataset.
A = IndexedRowMatrix(A)
A = A.toBlockMatrix(rowsPerBlock=100, colsPerBlock=1000)

# A is used several times in the multiplications that follow, so cache it.
A.cache()

# Progress marker for step 2 (original note: about 3 mins with the full set).
for banner in (" ", "Step 2 ready", " "):
    print(banner)

# Next multiplications: we need A * AT * A.
# A is tall and narrow. The original notes quote roughly 10^6 x 1000 for the
# full dataset, but give inconsistent row counts -- TODO confirm.
# A * AT would therefore be (rows x rows), whereas AT * A is only 1000 x 1000.
# For better performance use associativity: (A * AT) * A = A * (AT * A).
# Calculate A transpose