def test_append_bias_with_sp_vector(self):
    data = Vectors.sparse(3, {0: 2.0, 2: 2.0})
    expected = Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0})
    # Returned value must be SparseVector
    ret = MLUtils.appendBias(data)
    self.assertEqual(ret, expected)
    self.assertEqual(type(ret), SparseVector)
def test_right_number_of_results(self):
    num_cols = 1001
    sparse_data = [
        LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
        LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))
    ]
    chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))
    self.assertEqual(len(chi), num_cols)
    self.assertIsNotNone(chi[1000])
def parseEntry(xx):
    mindate = datetime.datetime(datetime.MINYEAR, 1, 1, 1, 1)
    xx = xx.split('\t')
    a_virtual = xx[0]
    browser = xx[1]
    referrer = xx[2]
    a_user_key = xx[3]
    try:
        birthyear = int(xx[4])
        age = 2015 - birthyear
    except Exception as _:
        birthyear = xx[4]
        age = -1
    gender = xx[5]
    if xx[6] != 'NAN':
        reg_date = datetime.datetime.strptime(xx[6], '%Y-%m-%d')
    else:
        reg_date = mindate
    device = xx[7]
    date = datetime.datetime.strptime(xx[8], '%d-%m-%Y')
    tdiff = datetime.timedelta(hours=int(xx[9]))
    date = date + tdiff
    year = date.year
    month = date.month
    day = date.day
    hour = int(xx[9])
    weekday = date.weekday()
    if reg_date > mindate:
        days_since_registration = (date - reg_date).days
    else:
        days_since_registration = -1
    metrics = list([int(x.replace(',0', '')) for x in xx[10:]])
    visits = metrics[0]
    visits_betalt = metrics[1]
    pageviews = metrics[2]
    pageview_nothome = metrics[3]
    pageview_betalt = metrics[4]
    timegroup_pvs = Vectors.sparse(maxInd, [(intervalIndDict[(weekday, hour)], pageviews)])
    timegroup_visit = Vectors.sparse(maxInd, [(intervalIndDict[(weekday, hour)], 1.)])
    return Row(browser=browser, a_user_key=a_user_key, age=age,
               day=day, hour=hour, date=date, weekday=weekday, pv=pageviews,
               pv_nh=pageview_nothome, pv_bet=pageview_betalt, referrer=referrer,
               device=device, gender=gender, days_since_registration=days_since_registration,
               reg_date=reg_date, timegroup_pvs=timegroup_pvs, timegroup_visit=timegroup_visit,
               a_virtual=a_virtual)
def ztest_toPandas(self):
    data = [(Vectors.dense([0.1, 0.2]),),
            (Vectors.sparse(2, {0: 0.3, 1: 0.4}),),
            (Vectors.sparse(2, {0: 0.5, 1: 0.6}),)]
    df = self.sql.createDataFrame(data, ["features"])
    self.assertEqual(df.count(), 3)
    pd = self.converter.toPandas(df)
    self.assertEqual(len(pd), 3)
    self.assertTrue(isinstance(pd.features[0], csr_matrix),
                    "Expected pd.features[0] to be csr_matrix but found: %s" %
                    type(pd.features[0]))
    self.assertEqual(pd.features[0].shape[0], 3)
    self.assertEqual(pd.features[0].shape[1], 2)
    self.assertEqual(pd.features[0][0, 0], 0.1)
    self.assertEqual(pd.features[0][0, 1], 0.2)
def add_svec(sv1, sv2):
    assert len(sv1) == len(sv2), "dimension mismatch"
    indices = []
    values = []
    i, j = 0, 0
    while i < len(sv1.indices) and j < len(sv2.indices):
        if sv1.indices[i] == sv2.indices[j]:
            indices.append(sv1.indices[i])
            values.append(sv1.values[i] + sv2.values[j])
            i += 1
            j += 1
        elif sv1.indices[i] < sv2.indices[j]:
            indices.append(sv1.indices[i])
            values.append(sv1.values[i])
            i += 1
        else:
            indices.append(sv2.indices[j])
            values.append(sv2.values[j])
            j += 1
    while i < len(sv1.indices):
        indices.append(sv1.indices[i])
        values.append(sv1.values[i])
        i += 1
    while j < len(sv2.indices):
        indices.append(sv2.indices[j])
        values.append(sv2.values[j])
        j += 1
    return Vectors.sparse(len(sv1), indices, values)
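# Usage sketch for add_svec above (hypothetical values; assumes the
# pyspark.mllib.linalg import used by the other snippets in this collection).
# The two-pointer merge keeps the index list sorted, which Vectors.sparse expects.
from pyspark.mllib.linalg import Vectors

sv1 = Vectors.sparse(4, [0, 2], [1.0, 3.0])
sv2 = Vectors.sparse(4, [2, 3], [2.0, 5.0])
print(add_svec(sv1, sv2))  # SparseVector(4, {0: 1.0, 2: 5.0, 3: 5.0})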
def createSparseVector(histogram):
    indexList = []
    countList = []
    for histogramIndex, count in sorted(histogram, key=getKey):
        indexList.append(histogramIndex)
        countList.append(count)
    return Vectors.sparse(2000, indexList, countList)
def test_persistence(self):
    # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up
    try:
        rmtree(path)
    except OSError:
        pass
def scoreOnePoint(self, x):
    """
    Compute the log likelihood of 'x' being generated under the current model.
    Also returns the probability that 'x' is generated by each component of the mixture.

    Parameters
    ----------
    x : array of shape (1, n_dim)
        Corresponds to a single data point.

    Returns
    -------
    log_likelihood_x : log likelihood of 'x'
    prob_x : responsibility of each cluster for the data point 'x'
    """
    lpr = (self.log_multivariate_normal_density_diag_Nd(x) + np.log(self.Weights))
    log_likelihood_x = logsumexp(lpr)
    prob_x = np.exp(lpr - log_likelihood_x)
    if self.isSparse == 1:
        temp_wt = np.dot(prob_x[:, np.newaxis], x.toArray()[np.newaxis, :])
        sqVec = Vectors.sparse(x.size, x.indices, x.values ** 2)
        temp_avg = np.dot(prob_x.T[:, np.newaxis], sqVec.toArray()[np.newaxis, :])
    else:
        temp_wt = np.dot(prob_x.T[:, np.newaxis], x[np.newaxis, :])
        temp_avg = np.dot(prob_x.T[:, np.newaxis], (x * x)[np.newaxis, :])
    return log_likelihood_x, prob_x, temp_wt, temp_avg
def test_glr_summary(self):
    from pyspark.mllib.linalg import Vectors
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity",
                                      weightCol="weight", fitIntercept=False)
    model = glr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.residuals(), DataFrame))
    self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    self.assertEqual(s.degreesOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedomNull, 2)
    self.assertEqual(s.rank, 1)
    self.assertTrue(isinstance(s.solver, basestring))
    self.assertTrue(isinstance(s.aic, float))
    self.assertTrue(isinstance(s.deviance, float))
    self.assertTrue(isinstance(s.nullDeviance, float))
    self.assertTrue(isinstance(s.dispersion, float))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.deviance, s.deviance)
def test_logistic_regression_summary(self):
    from pyspark.mllib.linalg import Vectors
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.roc, DataFrame))
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def load_cut_to_rdd(input_file, result_file):
    sc = SparkContext(appName='PythonKMeans', master="mesos://219.224.135.91:5050")
    lines = sc.textFile(input_file)
    data = lines.map(parseKV).cache()
    doc_term_tf = data.reduceByKey(add).cache()
    num_doc = doc_term_tf.map(lambda ((tid, term), tf): tid).distinct().count()
    terms_list = doc_term_tf.map(lambda ((tid, term), tf): term).distinct().collect()
    num_term = len(terms_list)
    term_idf = doc_term_tf.map(
        lambda ((tid, term), tf): (term, 1.0)
    ).reduceByKey(add).mapValues(lambda idf: math.log(float(num_doc) / (idf + 1)))
    tfidf_join = doc_term_tf.map(
        lambda ((tid, term), tf): (term, (tid, tf))).join(term_idf)
    tfidf = tfidf_join.map(
        lambda (term, ((tid, tf), idf)): (tid, (terms_list.index(term), tf * idf)))
    doc_vec = tfidf.groupByKey().mapValues(
        lambda feature: Vectors.sparse(num_term, feature).toArray()).cache()
    nonzero_count = 0
    f = open(result_file, 'w')
    f.write('%s %s\r\n' % (num_doc, num_term))
    for (tid, feature) in doc_vec.collect():
        for num in feature:
            f.write(str(num) + "\t")
        f.write("\n")
    f.close()
    sc.stop()
    return
def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None, multiclass=None):
    """
    Loads labeled data in the LIBSVM format into an RDD of LabeledPoint.
    The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR.
    Each line represents a labeled sparse feature vector using the following format:

    label index1:value1 index2:value2 ...

    where the indices are one-based and in ascending order. This method parses
    each line into a LabeledPoint, where the feature indices are converted to
    zero-based.

    :param sc: Spark context
    :param path: file or directory path in any Hadoop-supported file system URI
    :param numFeatures: number of features, which will be determined from the
                        input data if a nonpositive value is given. This is
                        useful when the dataset is already split into multiple
                        files and you want to load them separately, because some
                        features may not be present in certain files, which
                        leads to inconsistent feature dimensions.
    :param minPartitions: min number of partitions
    @return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> from pyspark.mllib.regression import LabeledPoint
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
    >>> tempFile.flush()
    >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
    >>> tempFile.close()
    >>> type(examples[0]) == LabeledPoint
    True
    >>> print examples[0]
    (1.0,(6,[0,2,4],[1.0,2.0,3.0]))
    >>> type(examples[1]) == LabeledPoint
    True
    >>> print examples[1]
    (-1.0,(6,[],[]))
    >>> type(examples[2]) == LabeledPoint
    True
    >>> print examples[2]
    (-1.0,(6,[1,3,5],[4.0,5.0,6.0]))
    """
    from pyspark.mllib.regression import LabeledPoint
    if multiclass is not None:
        warnings.warn("deprecated", DeprecationWarning)

    lines = sc.textFile(path, minPartitions)
    parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
    if numFeatures <= 0:
        parsed.cache()
        numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
    return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))
def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None, multiclass=None):
    """
    Loads labeled data in the LIBSVM format into an RDD of LabeledPoint.
    The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR.
    Each line represents a labeled sparse feature vector using the following format:

    label index1:value1 index2:value2 ...

    where the indices are one-based and in ascending order. This method parses
    each line into a LabeledPoint, where the feature indices are converted to
    zero-based.

    :param sc: Spark context
    :param path: file or directory path in any Hadoop-supported file system URI
    :param numFeatures: number of features, which will be determined from the
                        input data if a nonpositive value is given. This is
                        useful when the dataset is already split into multiple
                        files and you want to load them separately, because some
                        features may not be present in certain files, which
                        leads to inconsistent feature dimensions.
    :param minPartitions: min number of partitions
    @return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> from pyspark.mllib.regression import LabeledPoint
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> _ = tempFile.write(b"+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
    >>> tempFile.flush()
    >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
    >>> tempFile.close()
    >>> examples[0]
    LabeledPoint(1.0, (6,[0,2,4],[1.0,2.0,3.0]))
    >>> examples[1]
    LabeledPoint(-1.0, (6,[],[]))
    >>> examples[2]
    LabeledPoint(-1.0, (6,[1,3,5],[4.0,5.0,6.0]))
    """
    from pyspark.mllib.regression import LabeledPoint
    if multiclass is not None:
        warnings.warn("deprecated", DeprecationWarning)

    lines = sc.textFile(path, minPartitions)
    parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
    if numFeatures <= 0:
        parsed.cache()
        numFeatures = parsed.map(
            lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
    return parsed.map(lambda x: LabeledPoint(
        x[0], Vectors.sparse(numFeatures, x[1], x[2])))
def termFreqsMapper(termFreqs):
    docTotalTerms = sum(termFreqs.values())
    termFreqsFiltered = filter(lambda tf: tf[0] in bIdTerms, termFreqs.items())
    termScores = map(
        lambda tf: (bIdTerms[tf[0]], bIdfs[tf[0]] * termFreqs[tf[0]] / docTotalTerms),
        termFreqsFiltered)
    return Vectors.sparse(len(bIdTerms), termScores)
def test_model_transform(self):
    weight = Vectors.dense([3, 2, 1])
    densevec = Vectors.dense([4, 5, 6])
    sparsevec = Vectors.sparse(3, [0], [1])
    eprod = ElementwiseProduct(weight)
    self.assertEqual(eprod.transform(densevec), DenseVector([12, 10, 6]))
    self.assertEqual(eprod.transform(sparsevec), SparseVector(3, [0], [3]))
def _get_data(self):
    sql_context = SQLContext(self.sc)
    l = [
        ("I dont know why people think this is such a bad movie.",
         Vectors.sparse(4, {1: 1.0, 2: 1.0, 3: 1.0})),
    ]
    return sql_context.createDataFrame(l, ['text', 'features'])
def mkFeatureVector(idxSizeArr):
    tempSize = 0
    featureArr = []
    valueArr = []
    for i in idxSizeArr:
        featureArr.append(i[0] + tempSize)
        valueArr.append(1)
        tempSize += i[1]
    return Vectors.sparse(tempSize, featureArr, valueArr)
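# Usage sketch for mkFeatureVector above (hypothetical field layout). Each
# (index, size) pair is a one-hot slot within one categorical field; the fields
# are concatenated by offsetting every index with the running total of the
# preceding field sizes.
from pyspark.mllib.linalg import Vectors

idx_size = [(1, 3), (0, 2), (4, 5)]   # value 1 of 3, value 0 of 2, value 4 of 5
print(mkFeatureVector(idx_size))      # SparseVector(10, {1: 1.0, 3: 1.0, 9: 1.0})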
def vectorDFtoIndexedMatrix(df, vecvar, idcol):
    '''applicable to a dataframe already having assembled vectors'''
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row[vecvar].size, row[vecvar].indices, row[vecvar].values)))
    return IndexedRowMatrix(df)
def vectorize(fc):
    if "size" not in fc and "type" not in fc:
        sv = fc
    elif "size" not in fc and "type" in fc and fc["type"] == 1:
        sv = fc["values"]
    else:
        sv = Vectors.sparse(fc["size"], list(zip(fc["indices"], fc["values"]))).toArray()
    return sv
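# Sketch of the three input shapes vectorize above handles (hypothetical data):
# a plain dense list, a {"type": 1, "values": [...]} dense record, and a sparse
# {"size", "indices", "values"} record, which is expanded via Vectors.sparse.
dense_fc = [0.0, 1.5, 0.0]
typed_fc = {"type": 1, "values": [0.0, 1.5, 0.0]}
sparse_fc = {"size": 5, "indices": [1, 3], "values": [2.0, 4.0]}
print(vectorize(dense_fc))   # [0.0, 1.5, 0.0] (returned as-is)
print(vectorize(typed_fc))   # [0.0, 1.5, 0.0] (the dense "values" list)
print(vectorize(sparse_fc))  # [0. 2. 0. 4. 0.] (expanded to a dense numpy array)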
def score_and_peptide(peptide, query_peaks_bc):
    """Given a peptide and a query, perform a dot product."""
    # get max vector size based on bins
    peptide_mz_bins = [peak[0] for peak in peptide[1]]
    query_mz_bins = [peak[0] for peak in query_peaks_bc.value]
    max_bin_peptide = max(peptide_mz_bins) + 1
    max_bin_query = max(query_mz_bins) + 1
    max_size = max(max_bin_peptide, max_bin_query)
    # Create a SparseVector for the peptide
    peptide_sv = Vectors.sparse(max_size, peptide[1])
    # Create a SparseVector for the query
    query_sv = Vectors.sparse(max_size, query_peaks_bc.value)
    # return peptide and dot product result
    return (peptide[0], peptide_sv.dot(query_sv))
def _pre_dot(self, A):
    size = self.size
    a = A.entries.map(
        lambda entry: (entry.j, (entry.i, entry.value))
    ).groupByKey().map(
        lambda x: (x[0], Vectors.sparse(
            size, *list(zip(*sorted(x[1].data, key=lambda y: y[0]))))))
    return a
def test_output_columns(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    output = model.transform(df)
    self.assertEqual(output.columns, ["label", "features", "prediction"])
def document_vector(document):
    id = document[1]
    counts = defaultdict(int)
    for token in document[0]:
        if token in vocabulary:
            token_id = vocabulary[token]
            counts[token_id] += 1
    counts = sorted(counts.items())
    keys = [x[0] for x in counts]
    values = [x[1] for x in counts]
    return (id, Vectors.sparse(len(vocabulary), keys, values))
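# Usage sketch for document_vector above. The `vocabulary` dict below is a
# hypothetical stand-in; in the original job it is built elsewhere and shared
# with the workers.
from collections import defaultdict
from pyspark.mllib.linalg import Vectors

vocabulary = {"spark": 0, "sparse": 1, "vector": 2}
doc = (["spark", "sparse", "sparse"], 42)   # (tokens, document id)
print(document_vector(doc))  # (42, SparseVector(3, {0: 1.0, 1: 2.0}))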
def create_sparse_vector(row):
    term = row[0]
    if type(row[1][0]) is int:
        dictionary = {}
        dictionary[row[1][0]] = row[1][1]
    else:
        documents = row[1]
        dictionary = {}
        for document in documents:
            dictionary[document[0]] = document[1]
    return Vectors.sparse(N, dictionary)
def dividev(v1, n):
    indices = set(v1.indices)
    v1d = dict(zip(v1.indices, v1.values))
    zero = np.float64(0)
    values = {i: v1d.get(i, zero) / n
              for i in indices
              if v1d.get(i, zero) / n != zero}
    return Vectors.sparse(v1.size, values)
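# Usage sketch for dividev above (hypothetical values; assumes numpy is
# imported as np, as the function itself requires). Entries that become exactly
# zero after the division are dropped from the result.
import numpy as np
from pyspark.mllib.linalg import Vectors

v = Vectors.sparse(3, [0, 2], [2.0, 4.0])
print(dividev(v, 2))  # SparseVector(3, {0: 1.0, 2: 2.0})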
def sparse_vector_mul(v1, v2):
    if not (is_none_or_instance(v1, SparseVector) and is_none_or_instance(v2, SparseVector)):
        raise TypeError('v1 and v2 are not SparseVectors')
    if v1.size != v2.size:
        raise ValueError('v1 and v2 are not of same size')
    d1 = dict(zip(v1.indices, v1.values))
    d2 = dict(zip(v2.indices, v2.values))
    indices = sorted(list(set(v1.indices) & set(v2.indices)))
    values = [d1[i] * d2[i] for i in indices]
    return Vectors.sparse(v1.size, indices, values)
def test_binary_term_freqs(self):
    hashingTF = HashingTF(100).setBinary(True)
    doc = "a a b c c c".split(" ")
    n = hashingTF.numFeatures
    output = hashingTF.transform(doc).toArray()
    expected = Vectors.sparse(n, {hashingTF.indexOf("a"): 1.0,
                                  hashingTF.indexOf("b"): 1.0,
                                  hashingTF.indexOf("c"): 1.0}).toArray()
    for i in range(0, n):
        self.assertAlmostEqual(output[i], expected[i], 14, "Error at " + str(i) +
                               ": expected " + str(expected[i]) +
                               ", got " + str(output[i]))
def sparse_vector_add(v1, v2):
    if not (is_none_or_instance(v1, SparseVector) and is_none_or_instance(v2, SparseVector)):
        raise TypeError('v1 and v2 are not SparseVectors')
    if v1.size != v2.size:
        raise ValueError('v1 and v2 are not of same size')
    d1 = dict(zip(v1.indices, v1.values))
    d2 = dict(zip(v2.indices, v2.values))
    zero = NP.float64(0)
    indices = sorted(list(set(v1.indices) | set(v2.indices)))
    values = [d1.get(i, zero) + d2.get(i, zero) for i in indices]
    return Vectors.sparse(v1.size, indices, values)
def load_sparse_data():
    tempDataLocalPath = mlsql.internal_system_param["tempDataLocalPath"]
    # train the model on the new data for a few epochs
    datafiles = [file for file in os.listdir(tempDataLocalPath) if file.endswith(".json")]
    row_n = []
    col_n = []
    data_n = []
    y = []
    feature_size = 0
    row_index = 0
    for file in datafiles:
        with open(tempDataLocalPath + "/" + file) as f:
            for line in f.readlines():
                obj = json.loads(line)
                fc = obj[featureCol]
                if "size" not in fc and "type" not in fc:
                    feature_size = len(fc)
                    dic = [(i, a) for i, a in enumerate(fc)]
                    sv = SparseVector(len(fc), dic)
                elif "size" not in fc and "type" in fc and fc["type"] == 1:
                    values = fc["values"]
                    feature_size = len(values)
                    dic = [(i, a) for i, a in enumerate(values)]
                    sv = SparseVector(len(values), dic)
                else:
                    feature_size = fc["size"]
                    sv = Vectors.sparse(fc["size"], list(zip(fc["indices"], fc["values"])))
                for c in sv.indices:
                    row_n.append(row_index)
                    col_n.append(c)
                    data_n.append(sv.values[list(sv.indices).index(c)])
                if type(obj[labelCol]) is list:
                    y.append(np.array(obj[labelCol]).argmax())
                else:
                    y.append(obj[labelCol])
                row_index += 1
                if row_index % 10000 == 0:
                    print("processing lines: %s, values: %s" % (str(row_index), str(len(row_n))))
                    # sys.stdout.flush()
    print("X matrix : %s %s row_n:%s col_n:%s classNum:%s" % (
        row_index, feature_size, len(row_n), len(col_n),
        ",".join([str(i) for i in list(set(y))])))
    sys.stdout.flush()
    return sp.csc_matrix((data_n, (row_n, col_n)), shape=(row_index, feature_size)), y
def get_sparseVector(x):
    ids = []
    for j in x:
        if j in cluster.keys():
            ids.append(cluster[j])
    bag_words = {}
    for i in ids:
        bag_words[i] = float(ids.count(i)) / len(ids)
    # Create a SparseVector
    sv = Vectors.sparse(2000, bag_words)
    return sv
def transform(self, document):
    """
    Transforms the input document (list of terms) to term frequency
    vectors, or transform the RDD of document to RDD of term
    frequency vectors.
    """
    if isinstance(document, RDD):
        return document.map(self.transform)

    freq = {}
    for term in document:
        i = self.indexOf(term)
        freq[i] = freq.get(i, 0) + 1.0
    return Vectors.sparse(self.numFeatures, freq.items())
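# Usage sketch for the transform method above, via the public
# pyspark.mllib.feature.HashingTF API. A small numFeatures keeps the output
# readable; with so few buckets, unrelated terms may of course collide.
from pyspark.mllib.feature import HashingTF

htf = HashingTF(numFeatures=16)
tf = htf.transform("a a b c".split(" "))   # a local list yields one SparseVector
# barring collisions, tf has 2.0 in bucket htf.indexOf("a") and 1.0 for "b" and "c"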
def test_copy(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    ovr1 = ovr.copy({lr.maxIter: 10})
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
    model = ovr.fit(df)
    model1 = model.copy({model.predictionCol: "indexed"})
    self.assertEqual(model1.getPredictionCol(), "indexed")
def add(v1, v2):
    assert isinstance(v1, SparseVector) and isinstance(v2, SparseVector)
    assert v1.size == v2.size
    indices = set(v1.indices).union(set(v2.indices))
    v1d = dict(zip(v1.indices, v1.values))
    v2d = dict(zip(v2.indices, v2.values))
    zero = np.float64(0)
    values = {i: v1d.get(i, zero) + v2d.get(i, zero)
              for i in indices
              if v1d.get(i, zero) + v2d.get(i, zero) != zero}
    return Vectors.sparse(v1.size, values)
def make_dataFrame(my_dict1, my_dict2, most_list):
    fre_keys = list(my_dict1.keys())
    index_list = list()
    for i in range(len(most_list)):
        for j in range(len(fre_keys)):
            if most_list[i] == fre_keys[j]:
                index_list.append(i)

    fre_keys2 = list(my_dict2.keys())
    index_list2 = list()
    for i in range(len(most_list)):
        for j in range(len(fre_keys2)):
            if most_list[i] == fre_keys2[j]:
                index_list2.append(i)

    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.linalg import Vectors

    tmp_dict = dict()
    for i in range(len(index_list)):
        tmp_dict[index_list[i]] = my_dict1[most_list[index_list[i]]]
    tmp_dict2 = dict()
    for i in range(len(index_list2)):
        tmp_dict2[index_list2[i]] = my_dict2[most_list[index_list2[i]]]

    p = [LabeledPoint(1, Vectors.sparse(20, tmp_dict)),
         LabeledPoint(0, Vectors.sparse(20, tmp_dict2))]
    trainDf = spark.createDataFrame(p)
    trainDf.show()
def test_ml_mllib_vector_conversion(self):
    # to ml
    # dense
    mllibDV = Vectors.dense([1, 2, 3])
    mlDV1 = newlinalg.Vectors.dense([1, 2, 3])
    mlDV2 = mllibDV.asML()
    self.assertEqual(mlDV2, mlDV1)
    # sparse
    mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5})
    mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
    mlSV2 = mllibSV.asML()
    self.assertEqual(mlSV2, mlSV1)
    # from ml
    # dense
    mllibDV1 = Vectors.dense([1, 2, 3])
    mlDV = newlinalg.Vectors.dense([1, 2, 3])
    mllibDV2 = Vectors.fromML(mlDV)
    self.assertEqual(mllibDV1, mllibDV2)
    # sparse
    mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})
    mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
    mllibSV2 = Vectors.fromML(mlSV)
    self.assertEqual(mllibSV1, mllibSV2)
def convertToSparse(a):
    itemNo = []
    item = []
    j = 0
    for i in a:
        j = j + 1
        if j % 2 == 0:
            item.append(int(i))
        else:
            itemNo.append(int(i) + 7)
    return Vectors.sparse(43, itemNo, item)
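# Usage sketch for convertToSparse above (a hypothetical record of alternating
# item-number / quantity tokens; the +7 offset and the fixed size of 43 come
# from the original data layout).
print(convertToSparse(["1", "2", "5", "3"]))  # SparseVector(43, {8: 2.0, 12: 3.0})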
def _dot1(self, S):
    if self.size != S.numRows():
        raise Exception(
            f"size mismatch ({self.size},) and ({S.numRows()},{S.numCols()})")
    size = self.size
    v = self.rdd.map(lambda entry: (1, entry)).groupByKey().map(
        lambda x: (x[0], Vectors.sparse(
            size, *list(zip(*sorted(x[1].data, key=lambda y: y[0]))))))
    a = self._pre_dot(S)
    c = v.cartesian(a).map(
        lambda x: (x[1][0], float(x[0][1].dot(x[1][1])))
    ).filter(lambda entry: entry[1] != 0.0)
    return SparseDistributedVector(c, S.numCols())
def sparseVectorTimesMatrixAlloc(sparseVector, matrix):
    matrixCols = matrix.shape[1]
    tupleList = {}
    for col in range(matrixCols):
        indices = sparseVector.indices
        dotRes = 0
        for index in indices:
            value = sparseVector[index]
            dotRes += matrix[index, col] * value
        if abs(dotRes) > PCAUtils.zero:
            tupleList[col] = dotRes
    # alloc space for the sparse vector
    sparseRet = Vectors.sparse(matrixCols, tupleList)
    return sparseRet
def DFtoIndexedMatrix(df, quantvars, idcol):
    '''
    Convert a numeric dataframe to an indexed row matrix with sparse vectors as
    its basic units; not applicable to a dataframe that already has assembled vectors.
    '''
    df = VectorAssembler(
        inputCols=quantvars,
        outputCol="features"
    ).transform(df).select([idcol, "features"])
    # VectorAssembler automatically produces sparse vectors, so the next line should be fine
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row.features.size, row.features.indices, row.features.values)))
    return IndexedRowMatrix(df)
def test_linear_regression_summary(self):
    from pyspark.mllib.linalg import Vectors
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal",
                          weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
    self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
    self.assertAlmostEqual(s.meanSquaredError, 0.0)
    self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
    self.assertAlmostEqual(s.r2, 1.0, 2)
    self.assertTrue(isinstance(s.residuals, DataFrame))
    self.assertEqual(s.numInstances, 2)
    devResiduals = s.devianceResiduals
    self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)
def get_sparse_vectors(documents):
    # get unique word list
    WordList = const_unique_word_list(documents)
    # convert to counter object
    documents_word_counter = documents.map(lambda words: sorted(
        Counter([WordList.index(word) for word in words if word in WordList]).items(),
        key=lambda pair: pair[0], reverse=False))
    # convert to sparse vector
    documents_sparse_vectors = documents_word_counter.map(
        lambda counter: Vectors.sparse(len(WordList),
                                       tuple([pair[0] for pair in counter]),
                                       tuple([pair[1] for pair in counter])))
    return documents_sparse_vectors
def docTopics(filepath, topicMatrix):
    # for each topic:
    #   sum the probability of the words in the corpus
    #   normalize so that the topic probabilities sum to 1
    n_vcb = topicMatrix.shape[0]
    data = sc.textFile(filepath)
    parsedData = data.map(lambda line: line.strip().split(' ')).map(
        lambda x: (int(x[0]) - 1, (int(x[1]) - 1, float(x[2])))).groupByKey().mapValues(list)
    corpus = parsedData.map(
        lambda x: [x[0], normalize(Vectors.sparse(n_vcb, x[1]).dot(topicMatrix))])
    return corpus.collect()
def _dot2(self, v):
    if self.numCols() != v.size:
        raise Exception(
            f"size mismatch ({self.numRows()},{self.numCols()}) and ({v.size},)")
    size = v.size
    sv = v.rdd.map(lambda entry: (1, entry)).groupByKey().map(
        lambda x: (x[0], Vectors.sparse(
            size, *list(zip(*sorted(x[1].data, key=lambda y: y[0]))))))
    a = self._pre_dot(self, size=size)
    c = sv.cartesian(a).map(
        lambda x: (x[1][0], float(x[0][1].dot(x[1][1])))
    ).filter(lambda entry: entry[1] != 0.0)
    return sdv.SparseDistributedVector(c, self.numRows())
def test_apply_binary_term_freqs(self):
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
    n = 100
    hashingTF = HashingTF()
    hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
    output = hashingTF.transform(df)
    features = output.select("features").first().features.toArray()
    expected = Vectors.sparse(n, {(ord("a") % n): 1.0,
                                  (ord("b") % n): 1.0,
                                  (ord("c") % n): 1.0}).toArray()
    for i in range(0, n):
        self.assertAlmostEqual(features[i], expected[i], 14, "Error at " + str(i) +
                               ": expected " + str(expected[i]) +
                               ", got " + str(features[i]))
def add(v1, v2):
    """Add two sparse vectors

    >>> v1 = Vectors.sparse(3, {0: 1.0, 2: 1.0})
    >>> v2 = Vectors.sparse(3, {1: 1.0})
    >>> add(v1, v2)
    SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0})
    """
    assert isinstance(v1, SparseVector) and isinstance(v2, SparseVector)
    assert v1.size == v2.size
    indices = set(v1.indices).union(set(v2.indices))
    v1d = dict(zip(v1.indices, v1.values))
    v2d = dict(zip(v2.indices, v2.values))
    zero = np.float64(0)
    values = {i: v1d.get(i, zero) + v2d.get(i, zero)
              for i in indices
              if v1d.get(i, zero) + v2d.get(i, zero) != zero}
    return Vectors.sparse(v1.size, values)
def get_top_movies_for_user(user_ratings=fake_user_ratings, ratings_rdd=ratingsRDD):
    movies_length = ratings_rdd.map(lambda x: x[1]).max() + 1
    user_ids_with_ratings_rdd = (
        ratings_rdd
        .map(lambda (user_id, movie_id, rating): (user_id, [(movie_id, rating)]))
        .reduceByKey(lambda a, b: a + b)
        .filter(lambda x: len(x[1]) > 25)
        .map(lambda x: (x[0], Vectors.sparse(movies_length, x[1]))))
    user_seen_movies_list = [x[0] for x in user_ratings]
    most_similar_for_user_rdd = create_most_similar_for_user_rdd(
        user_ratings, user_ids_with_ratings_rdd, movies_length)
    similar_users_and_similarity_rdd = create_similar_users_and_similarity_rdd(
        most_similar_for_user_rdd)
    top_movies_for_user = (
        user_ids_with_ratings_rdd
        .join(similar_users_and_similarity_rdd)
        .flatMap(lambda x: create_id_rating_tuples(x[1][1], x[1][0]))
        .filter(lambda x: x[0] not in user_seen_movies_list)
        .reduceByKey(lambda a, b: max(a, b))
        .takeOrdered(100, lambda x: -x[1]))
    return top_movies_for_user
def add_hashed_features(self, df, num_hash_buckets=2 ** 15):

    def hash_function(raw_feats, num_buckets, print_mapping=False):
        """Calculate a feature dictionary for an observation's features based on hashing.

        Note:
            Use print_mapping=True for debug purposes and to better understand how the
            hashing works.

        Args:
            raw_feats (list of (int, str)): A list of features for an observation.
                Represented as (featureID, value) tuples.
            num_buckets (int): Number of buckets to use as features.
            print_mapping (bool, optional): If true, the mappings of featureString to
                index will be printed.

        Returns:
            dict of int to float: The keys will be integers which represent the buckets
                that the features have been hashed to. The value for a given key will
                contain the count of the (featureID, value) tuples that have hashed to
                that key.
        """
        mapping = {category + ':' + str(ind):
                   int(int(hashlib.md5(category + ':' + str(ind)).hexdigest(), 16) % num_buckets)
                   for ind, category in raw_feats}
        if print_mapping:
            print(mapping)

        def map_update(l, r):
            l[r] += 1.0
            return l

        sparse_features = reduce(map_update, mapping.values(), defaultdict(float))
        return dict(sparse_features)

    """Return a DataFrame with labels and hashed features.

    Note:
        Make sure to cache the DataFrame that you are returning.

    Args:
        df (DataFrame with 'tuples' column): A DataFrame containing the tuples to be hashed.

    Returns:
        DataFrame: A DataFrame with a 'label' column and a 'features' column that
            contains a SparseVector of hashed features.
    """
    tuples_to_hash_features_udf = udf(
        lambda x: Vectors.sparse(num_hash_buckets, hash_function(x, num_hash_buckets)),
        VectorUDT())
    return df.select(df.label,
                     tuples_to_hash_features_udf(df.features).alias("features")).cache()
def change_to_sparse(line):
    keys = []
    values = []
    temp = line.split("{")[1]
    temp = re.sub('}', '', temp)
    for item in temp.split(',')[1:]:
        index = int(item.split(':')[0])
        value = float(item.split(':')[1])
        keys.append(index)
        values.append(value)
    return Vectors.sparse(Vocab_size, sorted(keys), values)
def log_multivariate_normal_density_diag_Nd(self, x):
    """Compute Gaussian log-density at x for a diagonal model."""
    n_features = x.size
    if self.isSparse == 1:
        t = Vectors.sparse(x.size, x.indices, x.values ** 2).dot((1 / self.covarBc.value).T)
    else:
        t = np.dot(x ** 2, (1 / self.covarBc.value).T)
    lpr = -0.5 * (n_features * np.log(2 * np.pi)
                  + np.sum(np.log(self.covarBc.value), 1)
                  + np.sum((self.meansBc.value ** 2) / self.covarBc.value, 1)
                  - 2 * x.dot((self.meansBc.value / self.covarBc.value).T)
                  + t)
    return lpr
def test_save_load(self):
    temp_path = tempfile.mkdtemp()
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    ovrPath = temp_path + "/ovr"
    ovr.save(ovrPath)
    loadedOvr = OneVsRest.load(ovrPath)
    self.assertEqual(loadedOvr.getFeaturesCol(), ovr.getFeaturesCol())
    self.assertEqual(loadedOvr.getLabelCol(), ovr.getLabelCol())
    self.assertEqual(loadedOvr.getClassifier().uid, ovr.getClassifier().uid)
    modelPath = temp_path + "/ovrModel"
    model.save(modelPath)
    loadedModel = OneVsRestModel.load(modelPath)
    for m, n in zip(model.models, loadedModel.models):
        self.assertEqual(m.uid, n.uid)