def spark_pearson(a, b):
    # `sc` and `func` are assumed to come from the enclosing (decorator) scope;
    # `func.func_globals` is the Python 2 spelling of `func.__globals__`.
    rdd_a = sc.parallelize(a)
    rdd_b = sc.parallelize(b)
    g = func.func_globals
    g['pearson'] = Statistics.corr(rdd_a, rdd_b, 'pearson')
    g['rho'] = Statistics.corr(rdd_a, rdd_b, 'spearman')
    func(a, b)
def CorrelationFeature(vectors):
    matriz = sc.broadcast(Statistics.corr(vectors, method="pearson"))
    summary = Statistics.colStats(vectors)
    varianza = summary.variance()

    ######### new heuristic (Diogo's proposal) #########
    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = varianza[i] / aij[i]

    r = sorted([(value, key) for (key, value) in w.items()], reverse=True)  # features sorted

    index = []
    for i in r:
        index.append(i[1])

    index = index[0:6]  # taking the first 6 features
    return index
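# A usage sketch for CorrelationFeature, assuming an existing SparkContext `sc` and
# `numpy as np` in scope; the toy vectors below are illustrative stand-ins for real
# feature rows, not data from the original project.
from pyspark.mllib.linalg import Vectors

sample_vectors = sc.parallelize([
    Vectors.dense([1.0, 2.0, 0.5, 3.0, 1.5, 0.2, 4.0]),
    Vectors.dense([2.0, 1.8, 0.7, 2.5, 1.1, 0.4, 3.6]),
    Vectors.dense([3.0, 2.2, 0.4, 2.9, 1.9, 0.3, 4.4]),
])
selected_indices = CorrelationFeature(sample_vectors)  # indices of the 6 top-ranked features
print(selected_indices)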
def _transform(self, df):
    for k, v in df.schema[self.inputCol].metadata["ml_attr"]["attrs"].items():
        features_df = pd.DataFrame(v)
    column_names = list(features_df['name'])
    df_vector = df.rdd.map(lambda x: x[self.inputCol].toArray())

    # self.correlation_type is a class parameter
    matrix = Statistics.corr(df_vector, method=self.correlation_type)

    # apply pandas dataframe operations on the fit output
    corr_df = pd.DataFrame(matrix, columns=column_names, index=column_names)
    final_corr_df = pd.DataFrame(
        corr_df.abs().unstack().sort_values(kind='quicksort')).reset_index()
    final_corr_df.rename(
        {'level_0': 'col1', 'level_1': 'col2', 0: 'correlation_value'},
        axis=1, inplace=True)
    final_corr_df = final_corr_df[final_corr_df['col1'] != final_corr_df['col2']]

    # shortlisted dataframe based on custom cutoff
    shortlisted_corr_df = final_corr_df[
        final_corr_df['correlation_value'] > self.correlation_cutoff]
    return corr_df, shortlisted_corr_df
def compute_correlation_matrix(df, method='pearson'):
    # keep only float/double columns; needs to work according to the datatypes
    columns = [item[0] for item in df.dtypes
               if item[1].startswith('float') or item[1].startswith('double')]
    df_filter = df.select(columns)
    df_rdd = df_filter.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(df_rdd, method=method)
    corr_mat_df = pd.DataFrame(corr_mat,
                               columns=df_filter.columns,
                               index=df_filter.columns)
    return corr_mat_df
def compute_correlation_matrix(df, method='pearson'):
    # wrapper around
    # https://forums.databricks.com/questions/3092/how-to-calculate-correlation-matrix-with-all-colum.html
    df_rdd = df.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(df_rdd, method=method)
    corr_mat_df = pd.DataFrame(corr_mat, columns=df.columns, index=df.columns)
    return corr_mat_df
def calculateCorrelation(rdd1, rdd2):
    joined_rdd = rdd1.join(rdd2).sortByKey()
    rdd1_values = joined_rdd.map(lambda x: x[1][0])
    rdd2_values = joined_rdd.map(lambda x: x[1][1])
    correlation_value = Statistics.corr(rdd1_values, rdd2_values)
    return (joined_rdd, correlation_value)
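# A minimal usage sketch for calculateCorrelation, assuming an active SparkContext `sc`
# and two keyed RDDs that share keys; the sample data is made up for illustration.
rdd_temperature = sc.parallelize([("2020-01-01", 3.1), ("2020-01-02", 4.7), ("2020-01-03", 2.5)])
rdd_pressure = sc.parallelize([("2020-01-01", 1011.0), ("2020-01-02", 1004.5), ("2020-01-03", 1019.2)])
joined, corr_value = calculateCorrelation(rdd_temperature, rdd_pressure)
print(corr_value)  # Pearson coefficient between the joined value pairs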
def compute_correlation_matrix(df, method='pearson'):
    df_rdd = df.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(df_rdd, method=method)
    corr_mat_df = pd.DataFrame(corr_mat, columns=df.columns, index=df.columns)
    return corr_mat_df
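# A self-contained sketch of calling compute_correlation_matrix on an all-numeric
# DataFrame; the SparkSession, column names, and values here are assumptions for
# illustration (pandas and Statistics are assumed imported, as in the helper above).
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("corr-example").getOrCreate()
example_df = spark.createDataFrame(
    [(1.0, 2.0, 0.5), (2.0, 4.1, 0.3), (3.0, 6.2, 0.9)],
    ["x", "y", "z"])
print(compute_correlation_matrix(example_df, method='spearman'))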
def correlationTemperatureHardness(df, spark):
    column1 = (df.select('temperature').rdd
               .map(lambda x: x['temperature'])
               .filter(lambda x: x is not None)
               .filter(lambda x: x != ''))
    column2 = (df.select('hardness').rdd
               .map(lambda x: x['hardness'])
               .filter(lambda x: x is not None)
               .filter(lambda x: x != ''))
    data = column1.zip(column2)
    corr_matrix = Statistics.corr(data)
    return corr_matrix[1][0]
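# Note: filtering each column independently can desynchronise the two RDDs, and zip()
# requires identical partitioning and element counts. A safer sketch under the same
# assumed column names drops incomplete rows first and passes the two series directly:
def correlation_temperature_hardness_safe(df):
    clean = df.select('temperature', 'hardness').dropna()
    temperature = clean.rdd.map(lambda row: float(row['temperature']))
    hardness = clean.rdd.map(lambda row: float(row['hardness']))
    return Statistics.corr(temperature, hardness, method='pearson')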
def compute_correlation_matrix(df, method='spearman'):
    churn_data3_rdd = df.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(churn_data3_rdd, method=method)
    corr_mat_churn_data3 = pd.DataFrame(corr_mat, columns=df.columns, index=df.columns)
    return corr_mat_churn_data3
def estimate_correlation_matrix(df, cols, method='pearson', round_decimals=3):
    features = df.select(cols).rdd.map(lambda row: row[0:])
    corr_mat = pd.DataFrame(
        Statistics.corr(features, method=method), columns=cols, index=cols) \
        .round(round_decimals) \
        .style \
        .background_gradient(cmap='coolwarm')
    return corr_mat
def correlations(sdf, colnames, method='pearson', ax=None, plot=True):
    sdf = sdf.notHandy()
    correlations = Statistics.corr(
        sdf.select(colnames).dropna().rdd.map(lambda row: row[0:]), method=method)
    pdf = pd.DataFrame(correlations, columns=colnames, index=colnames)
    if plot:
        if ax is None:
            fig, ax = plt.subplots(1, 1)
        return sns.heatmap(round(pdf, 2), annot=True, cmap="coolwarm",
                           fmt='.2f', linewidths=.05, ax=ax)
    else:
        return pdf
def main():
    ### Loading data from sources
    print 'before preprocess'
    data = [preprocess(input_file)]
    print 'after preprocess'

    # get spark context
    sc = getSparkContext()

    print 'before parallelize'
    data = np.hstack((data[0]['train_data'],
                      data[0]['train_labels'].reshape((data[0]['train_labels'].shape[0], 1))))
    data = [Vectors.dense(list(data[row, :])) for row in range(0, data.shape[0])]
    samples = sc.parallelize(data)
    # samples.persist()

    pearsonCorr = Statistics.corr(samples)
    print str(pearsonCorr).replace('nan', 'NaN')
    sys.exit()
    print Statistics.corr(data, method="pearson")
def DropColsByCor(df, cor_cutoff):
    tdf = df
    dsu_dict = {}
    string_cols = []
    for (a, b) in df.dtypes:
        if b == 'string':
            string_cols.append(a)
    for cols in string_cols:
        tdf = tdf.drop(cols)
    num_cols = len(tdf.columns)
    dsu = [i for i in range(num_cols)]
    size = [1 for i in range(num_cols)]
    features = tdf.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(features, method="pearson")
    for i in range(num_cols):
        for j in range(i):
            if corr_mat[i][j] > cor_cutoff:
                union(dsu, size, i, j)
    drop_cols = []
    for i in range(num_cols):
        if dsu[i] != i:
            drop_cols.append(tdf.columns[i])
        # Setting up dictionary to save up on iterations
        if dsu[i] == i:
            dsu_dict[tdf.columns[i]] = [tdf.columns[i]]
    for i in range(num_cols):
        if dsu[i] != i:
            ri = root(dsu, i)
            dsu_dict[tdf.columns[ri]].append(tdf.columns[i])
    for cols in drop_cols:
        tdf = tdf.drop(cols)
    string_df = df.select(string_cols)
    # Adding index to help merge both string and numeric dataframes
    tdf = tdf.withColumn("RowNoIndex", monotonically_increasing_id())
    string_df = string_df.withColumn("RowNoIndex", monotonically_increasing_id())
    tdf = tdf.join(string_df, ['RowNoIndex'])
    tdf = tdf.drop('RowNoIndex')
    return dsu_dict, tdf
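# DropColsByCor relies on `union` and `root` helpers that are not shown above. A minimal
# union-find sketch that matches how they are called (an assumption about the missing
# helpers, not the original author's code):
def root(dsu, i):
    # follow parent pointers to the representative of i's group, compressing the path
    while dsu[i] != i:
        dsu[i] = dsu[dsu[i]]
        i = dsu[i]
    return i

def union(dsu, size, i, j):
    # merge the groups of i and j, attaching the smaller tree under the larger one
    ri, rj = root(dsu, i), root(dsu, j)
    if ri == rj:
        return
    if size[ri] < size[rj]:
        ri, rj = rj, ri
    dsu[rj] = ri
    size[ri] += size[rj]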
def corr(sdf_) -> pd.DataFrame:
    """Calculate the correlation matrix of the data.

    :param sdf_: pyspark dataframe
    :return: correlation matrix as a pandas dataframe
    """
    col_names = sdf_.columns
    features = sdf_.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(features, method="pearson")
    corr_df = pd.DataFrame(corr_mat)
    corr_df.index, corr_df.columns = col_names, col_names
    return corr_df
def CorrelationFeature(vectors):
    # print 'Calculating Correlation'
    matriz = sc.broadcast(Statistics.corr(vectors, method="pearson"))
    summary = Statistics.colStats(vectors)
    varianza = summary.variance()

    ######### new heuristic (Diogo's proposal) #########
    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = varianza[i] / aij[i]

    r = sorted([(value, key) for (key, value) in w.items()], reverse=True)  # features sorted
    # print r

    # print 'calculating features selections'
    # Old heuristic:
    # w = {}
    # for i in range(len(matriz)):
    #     w[i] = 0
    #     for j in np.nan_to_num(matriz[i]):
    #         k = abs(j)
    #         w[i] = w[i] + k
    # r = sorted([(value, key) for (key, value) in w.items()], reverse=True)

    # vectors = np.matrix(vectors)
    # beforeMatrix = vectors.map(lambda x: np.matrix(x))

    index = []
    for i in r:
        index.append(i[1])

    index = index[0:6]  # taking the first 6 features
    # MatrixReducer(vectors, index)
    return index
def _compute_corr_matrix(spark_df, corr_method='pearson'):
    """
    A helper function for computing a correlation matrix of a spark dataframe (works only with
    numeric columns). The correlation matrix represents the pairwise correlation of all the
    variables. By default the method uses Pearson correlation (a measure of the linear
    correlation between two variables X and Y; it has a value between +1 and -1, where 1 is
    total positive linear correlation, 0 is no linear correlation, and -1 is total negative
    linear correlation). The correlation matrix is computed with Spark.

    Args:
        :spark_df: the spark dataframe to compute the correlation matrix for
        :corr_method: the correlation method, defaults to pearson (spearman supported as well)

    Returns:
        a pandas dataframe with the correlation matrix

    Raises:
        :ValueError: when the provided dataframe is of a structure that can't be used for
                     computing correlations.
    """
    numeric_columns = spark_df.dtypes
    if len(numeric_columns) == 0:
        raise ValueError(
            "The provided spark dataframe does not contain any numeric columns. "
            "Cannot compute feature correlation on categorical columns. The numeric datatypes are: {}"
            " and the number of numeric datatypes in the dataframe is: {} ({})"
            .format(constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES, len(spark_df.dtypes), spark_df.dtypes))
    if len(numeric_columns) == 1:
        raise ValueError(
            "The provided spark dataframe only contains one numeric column. "
            "Cannot compute feature correlation on just one column. The numeric datatypes are: {}"
            " and the number of numeric datatypes in the dataframe is: {} ({})"
            .format(constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES, len(spark_df.dtypes), spark_df.dtypes))
    if len(numeric_columns) > constants.FEATURE_STORE.MAX_CORRELATION_MATRIX_COLUMNS:
        raise ValueError(
            "The provided dataframe contains {} columns, feature correlation can only be "
            "computed for dataframes with < {} columns due to scalability reasons (the number "
            "of correlations grows quadratically with the number of columns)"
            .format(len(numeric_columns), constants.FEATURE_STORE.MAX_CORRELATION_MATRIX_COLUMNS))
    spark_df_rdd = spark_df.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(spark_df_rdd, method=corr_method)
    pd_df_corr_mat = pd.DataFrame(corr_mat, columns=spark_df.columns, index=spark_df.columns)
    return pd_df_corr_mat
def correlation(df, target_col):
    # drop string columns
    columns_to_drop = [item[0] for item in df.dtypes if item[1].startswith('string')]
    df_numeric = df.drop(*columns_to_drop)

    # generate the correlation matrix
    features = df_numeric.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(features, method="pearson")
    corr_df = pd.DataFrame(corr_mat)
    corr_df.index, corr_df.columns = df_numeric.columns, df_numeric.columns

    # keep only the correlations against the target column
    corr_df = corr_df[target_col]
    return corr_df
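# A hypothetical call, assuming a mixed-type DataFrame `customer_df` with a numeric
# "churn" column (both names are illustrative, not from the original code); the result
# is a pandas Series of each numeric column's Pearson correlation with the target.
target_correlations = correlation(customer_df, "churn")
print(target_correlations.sort_values(ascending=False))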
def _run_all_in_master_memory(self, method):
    """
    Run the spark pearson correlation by loading all the TS content (i.e. values) into master memory.

    Each coefficient will be computed by a worker (Spark decides the best choice to apply).
    """
    # Create or get a spark Context
    spark_context = ScManager.get()

    # Get TS content
    rdd_content = self._get_ts(spark_context)

    # Job distribution is made by Statistics.corr (Spark correlation matrix calculation)
    self.results = Statistics.corr(rdd_content, method=method)

    ScManager.stop()
def create_or_update_week(influencer_tweets, topic_tweets, week):
    topic_cor = []
    influencer_cor = []
    for t in topic_tweets:
        for i in influencer_tweets:
            if t['time'] == i['time']:
                topic_cor.append(t['count'])
                influencer_cor.append(i['count'])

    if len(topic_cor) <= 1:
        corr = 0
    else:
        sc = SparkContext(appName="CorrelationPerWeek")
        topic_tweets = sc.parallelize(topic_cor)
        influencer_tweets = sc.parallelize(influencer_cor)
        corr = Statistics.corr(topic_tweets, influencer_tweets, "pearson")
        sc.stop()

    url = "http://localhost:8000/api/weeks/"
    today = datetime.fromtimestamp(week / 1000.0)
    payload = '{ "score": %f, "start_date": "%s" }' % (
        float(corr),
        str(today.year) + "-" + str(today.month) + "-" + str(today.day))
    headers = {
        'authorization': "Basic ZGV2OjEyMzQ=",
        'content-type': "application/json",
        'cache-control': "no-cache",
        'postman-token': "7c8668c0-a4c2-f42d-66a9-95cbfb7806c5"
    }
    try:
        response = requests.request("POST", url, data=payload, headers=headers)
        return response.json()['id']
    except:
        print "error"
        return 1
def CorrelationFeature(vectors, schema):
    print("Calculating Correlation")
    vectors_rdd = vectors.rdd.map(
        lambda row: Vectors.dense([x for x in row["features"]]))
    matriz = spark.sparkContext.broadcast(
        Statistics.corr(vectors_rdd, method="pearson"))
    summary = Statistics.colStats(vectors_rdd)
    variance = summary.variance()

    ######## Heuristic ########
    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = variance[i] / aij[i]

    r = sorted([(value, key) for (key, value) in w.items()], reverse=True)
    index = r[0:6]

    a = []
    for i in index:
        a.append((0, int(i[1])))

    red = MatrixReducer(vectors_rdd, a, schema)
    return red
def get_language_correlation():
    """
    Calculates the correlation between GitHub languages.
    """
    # Create Spark Context
    sc = SparkContext(appName="LanguageCorrelations")

    # Create SQL Context
    sqlCtx = SQLContext(sc)

    # Create a schemaRDD from json datasets stored in HDFS
    pushes = sqlCtx.jsonFile('git_14_15/git_results')

    # Register the schemaRDD as a Table
    pushes.registerTempTable('pushes')

    # Filter the data to get the pushes for the languages from LANG
    filtered = sqlCtx.sql('select * from pushes where repository_language in ' + str(tuple(LANG)))

    # Map transformation to get the RDD in the format (actor, {lang: pushes})
    f_pair = filtered.map(lambda s: (s.actor, {s.repository_language: s.pushes}))

    # Group the RDDs by actor to get the format (actor, [{lang1: pushes}, {lang2: pushes}, ...])
    f_group = f_pair.groupByKey()

    # Merge the language dictionaries to get a single ordered dict per actor
    f_merged = f_group.map(lambda s: merge_lang_dict(s[1]))

    # Create an RDD of vectors from the push counts, as required by the correlation algorithm
    vectors = f_merged.map(lambda s: Vectors.dense(map(float, s.values())))

    # Call the correlation function
    matrix = Statistics.corr(vectors)
    print matrix

    plot_graph(matrix)
    sc.stop()
# Correlation matrix
# print(df.corr())

# ### MLlib Statistics

# In[5]:

from pyspark.mllib.stat import Statistics

# Basic statistics
partsNum = parts.map(lambda line: line[0:8])
summary = Statistics.colStats(partsNum)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())

Statistics.corr(partsNum, method="pearson")

# # Supervised classification
# ## Naive Bayes

# In[6]:

from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
import utils_mesure

nomF_svm = "glass_svm"
data = sc.textFile("file:/C:/spark-1.6.0-bin-hadoop2.4/" + nomF_svm + ".csv")

# remove the header
nomColInit = data.first()
def pearson(self):
    return Statistics.corr(self.a, self.b, 'pearson')
# Each vector represents a year, with values corresponding to crimes for each beat;
# years are the "rows", and beats are "columns".
crimesVectors = crimesByYear3.map(lambda x: Vectors.dense(x[1]))
crimes3.unpersist()

#### Correlation ##########################################################
# If a single RDD of Vectors is passed in, a correlation matrix comparing the columns
# in the input RDD is returned.
# If you want to explore your data it is best to compute both, since the relation between
# the Spearman (S) and Pearson (P) correlations will give some information. Briefly,
# S is computed on ranks and so depicts monotonic relationships, while P is on true values
# and depicts linear relationships.
# http://stats.stackexchange.com/questions/8071/how-to-choose-between-pearson-and-spearman-correlation
pearsonCorr = Statistics.corr(crimesVectors)
spearmanCorr = Statistics.corr(crimesVectors, method="spearman")
print pearsonCorr
print spearmanCorr

type(pearsonCorr)

# Check dimensions: should be (#beats, #beats)
pearsonCorr.shape

# create correlation dictionary function
def createCorrDic(corr):
    """
    Key: (i,j), Value: value
    So (i,j) represents the index for beats, value the correlation value between them
    @ param: correlation matrix
import numpy as np

from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="CorrelationsExample")  # SparkContext

    # $example on$
    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
    # seriesY must have the same number of partitions and cardinality as seriesX
    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))

    data = sc.parallelize(
        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([5.0, 33.0, 366.0])]
    )  # an RDD of Vectors

    # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print(Statistics.corr(data, method="pearson"))
    # $example off$

    sc.stop()
from pyspark.sql import HiveContext
from pyspark.mllib.stat import Statistics
from pyspark import SparkContext

sc = SparkContext()
sqlContext = HiveContext(sc)

initialquery = sqlContext.sql("SELECT A.avg_procedure_score, B.patientsurveyscore FROM (SELECT p.hospitalid, avg(p.score) as avg_procedure_score FROM procedures p GROUP BY p.hospitalid) A JOIN survey_results B ON B.hospitalid = A.hospitalid")

survey_score = initialquery.map(lambda x: x.patientsurveyscore)
avg_procedure_scores = initialquery.map(lambda x: x.avg_procedure_score)

print Statistics.corr(avg_procedure_scores, survey_score, method="pearson")
    np.array([2.0, 20.0, 200.0]),
    np.array([3.0, 30.0, 300.0])
])

summary = Statistics.colStats(mat)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())

## correlation
# vectors
seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])
seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 55.0])
print('Pearson correlation is: {}'.format(
    Statistics.corr(seriesX, seriesY, method='pearson')))
print('Spearman correlation is: {}'.format(
    Statistics.corr(seriesX, seriesY, method='spearman')))

# matrix
print('Correlation of matrix: {}'.format(Statistics.corr(mat, method='pearson')))

## sampling
# sampling methods can be performed on RDDs of key-value pairs
data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')])
fractions = {1: 0.1, 2: 0.6, 3: 0.3}
approxSample = data.sampleByKey(False, fractions)
# Remove the first line (header)
header = rddUSD.first()
dataLines = rddUSD.filter(lambda x: x != header)
dataLines.take(5)

usdVectors = dataLines.map(transformationLR.transformToNumeric)

# Perform statistical analysis
usdStats = Statistics.colStats(usdVectors)
usdStats.mean()
usdStats.variance()
usdStats.min()
usdStats.max()
Statistics.corr(usdVectors)

# Transform to a DataFrame for input to Machine Learning.
# Drop columns that are not required (low correlation).
usdLP = usdVectors.map(transformationLR.transformToLabeledPoint)
usdDF = sqlContext.createDataFrame(usdLP, ["label", "features"])
usdDF.select("label", "features").show(10)

# Split into training and testing data
(trainingData, testData) = usdDF.randomSplit([0.7, 0.3])
trainingData.count()
testData.count()

# Build the model on training data
lr = LinearRegression(maxIter=10)
# Vectorise the data and drop NAs.
features_vector = assembler.transform(
    features_subset.na.drop()).select(vector_col)

# Get the correlation matrix
corr_matrix = Correlation.corr(features_vector, vector_col)

# Output the matrix.
corr_matrix.collect()[0]["pearson({})".format(vector_col)].values

# Other correlation implementation.
from pyspark.mllib.stat import Statistics

col_names = features_subset.columns
features = features_subset.na.drop().rdd.map(lambda row: row[0:])
corr_mat = Statistics.corr(features, method="pearson")
corr_df = pd.DataFrame(corr_mat)
corr_df.index, corr_df.columns = col_names, col_names
print(corr_df.to_string())

# For a correlation plot of all variables in the features dataset, see the Python script "Corr_plot.ipynb".
# For the first 5 columns, we see that Area_Method_of_Moments_Overall_Standard_Deviation_4 is highly
# correlated with Area_Method_of_Moments_Overall_Standard_Deviation_5 with a correlation coefficient of
# 0.946 (3dp), and Area_Method_of_Moments_Overall_Standard_Deviation_2 is highly correlated with
# Area_Method_of_Moments_Overall_Standard_Deviation_4 with a correlation coefficient of 0.849 (3dp).

##########################

# Define the schema of the MAGD dataset.
MAGD_schema = StructType([
from pyspark.sql import SQLContext
from pyspark import SparkContext
# The DataFrame-based correlation API is pyspark.ml.stat.Correlation, which expects
# pyspark.ml.linalg vectors; Statistics.corr (pyspark.mllib.stat) works on RDDs, not DataFrames.
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), ),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]), ),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]), ),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), )]

sc = SparkContext("local", "sample")
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(data, ["features"])

r1 = Correlation.corr(df, "features").head()
print("Pearson correlation matrix:\n" + str(r1[0]))

r2 = Correlation.corr(df, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))
summary = Statistics.colStats(mat)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())
print(summary.max())
print(summary.min())
print(summary.count())
print(summary.normL1())
print(summary.normL2())

# correlation
x = sc.parallelize(np.random.randn(4, 1))
y = sc.parallelize(np.random.randn(4, 1))
print("Correlation :", str(Statistics.corr(x, y)))

# Chi-square
# For Vectors
x = Vectors.dense(np.random.random_sample((5)))
y = Vectors.dense(np.random.random_sample((5)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)

# For Matrices
x = Matrices.dense(4, 2, np.random.random_sample((8)))
y = Matrices.dense(4, 2, np.random.random_sample((8)))
chisqr = Statistics.chiSqTest(x, y)
def rho(self):
    return Statistics.corr(self.a, self.b, 'spearman')
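# The pearson/rho methods above read like members of a small wrapper class. A minimal
# sketch of such a class, assuming `a` and `b` are already RDDs of floats and that
# Statistics is imported from pyspark.mllib.stat (class name and constructor are
# assumptions, not the original author's code):
class CorrelationPair(object):
    def __init__(self, a, b):
        self.a = a  # RDD of floats
        self.b = b  # RDD of floats, same cardinality and partitioning as `a`

    def pearson(self):
        return Statistics.corr(self.a, self.b, 'pearson')

    def rho(self):
        return Statistics.corr(self.a, self.b, 'spearman')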
withstations = tidy.rdd.map(lambda row: Row(station=map_yx_to_station(row.yx),
                                            datehour=row.datehour)).toDF()
withstations.registerTempTable('stationincidents')

incidentcount = sqlc.sql(
    "select station, datehour, count(1) as incidents from stationincidents group by station, datehour"
)
print("we now have incidents by station/hour in incidentcount")
incidentcount.show(10)

# now join the two tables
joined = cleanedaverages.join(incidentcount, ['station', 'datehour'], 'outer')

# if incident data doesn't exist for that station/datehour, then it is 0
zeroed = joined.rdd.map(lambda row: Row(station=row.station,
                                        datehour=row.datehour,
                                        temp=row.temp,
                                        wind=row.wind,
                                        incidents=row.incidents if row.incidents else 0)).toDF()

# if temp/wind data doesn't exist for that station/datehour, then we can't use that row
final = zeroed.filter(zeroed.temp.isNotNull()).filter(
    zeroed.wind.isNotNull()).filter(zeroed.temp != 0)

# finally apply the correlation test
vecs = final.rdd.map(
    lambda row: Vectors.dense([row.temp, row.wind, row.incidents]))
print(Statistics.corr(vecs))
from pyspark.mllib.stat import Statistics

summary = Statistics.colStats(rideRDD)
print "Duration\tMorning\tAfternoon\tEvening\tWeekday\tMale\tAge\n"
print("%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\n") % tuple(summary.mean())
print("%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\n") % tuple(summary.variance())
print("%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\n") % tuple(summary.numNonzeros())

# #### 1(c) Determine correlation of Age with Duration

# In[3]:

durationRDD = rideRDD.map(lambda x: x[0])  # Extract Duration from the RDD
ageRDD = rideRDD.map(lambda x: x[6])       # Extract Age from the RDD
print(Statistics.corr(durationRDD, ageRDD, method="pearson"))  # Pearson correlation of Age vs. Duration

# ### ** Part 2: Linear Regression **
# #### ** (2a) Plotting **

# In[4]:

# Plot Age vs. Duration
get_ipython().magic(u'matplotlib inline')
import matplotlib.pyplot as plt

plt.scatter(ageRDD.collect(), durationRDD.collect(), alpha=0.5)
plt.xlabel("Age")
    [[37.7816834, -122.3887657],
     [37.7469112, -122.4821759],
     [37.7411022, -120.804151],
     [37.4834543, -122.3187302],
     [37.7576436, -122.3916382],
     [37.7970013, -122.4140409],
     [37.748496, -122.4567461],
     [37.7288155, -122.4210133],
     [37.5839487, -121.9499339],
     [37.7157156, -122.4145311],
     [37.7329613, -122.5051491],
     [37.7575891, -122.3923824],
     [37.7521169, -122.4497687]])),
    ["SF18", "SF04", "SF15", "SF17", "SF36", "SF37",
     "SF07", "SF11", "SF12", "SF14", "SF16", "SF19", "SF34"]), d, h))

located = located.map(lambda (l, d, h): ((d, h, l), 1))
located = located.reduceByKey(lambda a, b: a + b)
joined = located.join(reducedTuple)
print joined.first()

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

vecs = joined.map(lambda ((d, h, s), (c, (w, t))): Vectors.dense([t, w, c]))
print(Statistics.corr(vecs))
    return year_beat_crimes

years_crimes = filtered_more.flatMap(lambda x: (collapse_years(x)))
# print(years_crimes.collect())
# [(2001, 1584), (2002, 1327), (2003, 1272), (2004, 1164), (2005, 1089),
#  (2006, 1090), (2007, 1313), (2008, 1167), (2009, 1207), (2010, 1132),
#  (2011, 1028), (2012, 1430), (2013, 1625), (2014, 1616), (2015, 526),
#  (2001, 1720), (2002, 1679), (2003, 1691), (2004, 1412), (2005, 1172),
#  (2006, 1169), (2007, 1260), (2008, 1541), (2009, 1583), (2010, 1432),
#  (2011, 1327), (2012, 1124), (2013, 942), (2014, 891), (2015, 339),

years_crimes_grouped = years_crimes.groupByKey()
for_vectors = years_crimes_grouped.mapValues(lambda x: sorted(list(x), key=lambda y: y))
crime_vectors = for_vectors.map(lambda x: Vectors.dense([x[1]]))
beat_correlations = Statistics.corr(crime_vectors, method="pearson")
# print(np.shape(beat_correlations))
# produces a 254x254 array of correlations

def get_max_corr(x):
    max_corr = 0
    for (i, j), values in np.ndenumerate(x):
        if x[i][j] > max_corr and x[i][j] < 1 and abs(i - j) == 1:
            max_corr = x[i][j]
    return max_corr

max_correlation = get_max_corr(beat_correlations)
max_indices = np.where(beat_correlations == max_correlation)
# print(max_correlation)
# print(max_indices)
header = raw_data.first()
raw_data = raw_data.filter(lambda x: x != header)

def parse_interaction(line):
    # split lines based on the delimiter, and create a list
    line_split = line.split(",")
    # replace NA with zeros
    line_split = [w.replace('NA', '0') for w in line_split]
    # remove year, and other non-numeric data
    """
    0  = Year
    11 = ActualElapsedTime
    12 = CRSElapsedTime
    13 = AirTime
    16 = Distance
    """
    symbolic_indexes = [0, 8, 10, 11, 12, 13, 16, 17, 18, 22]
    clean_line_split = [item for i, item in enumerate(line_split) if i not in symbolic_indexes]
    return np.array([float(x) for x in clean_line_split])

vector_data = raw_data.map(parse_interaction)

# start timer at this point
startTime = datetime.now()
print(Statistics.corr(vector_data, method="pearson"))
print('Time consumed = '), (datetime.now() - startTime)
merged_final = merged.reduceByKey(lambda x, y: int(x) + int(y))

# Sort by month-year.
# Map each year to all beats and their corresponding crime counts for that year,
# and sort the counts by beat.
groupedbeatCountsbymonthyear = merged_final.map(
    lambda row: (row[0][1], (row[0][0], row[1]))) \
    .groupByKey() \
    .mapValues(lambda val: sorted(list(val), key=lambda t: t[0]))

# Create a list of all beats
groupbeats = [elem[0] for elem in groupedbeatCountsbymonthyear.values().first()]

beatvectorCounts = groupedbeatCountsbymonthyear.values() \
    .map(lambda row: Vectors.dense([elem[1] for elem in row]))

# Compute the correlation between all beats for yearly crime counts
corrMatrix = Statistics.corr(beatvectorCounts, method='pearson')

# Fill the diagonal of the correlation matrix with 0's
corrMatrix.flags['WRITEABLE'] = True
np.fill_diagonal(corrMatrix, 0.0)

# Get the 10 largest correlation values from the matrix. The correlation matrix is symmetric,
# so we take the largest 20 and step by 2. Finally, the indices of the corresponding beat pairs
# for the top 10 correlation values are obtained.
sortOrder = corrMatrix.argsort(axis=None)
indices = np.unravel_index(sortOrder[-20::2], corrMatrix.shape)

# The corresponding beat names are obtained for the top 10 correlated beat pairs
topBeatPairs = [(groupbeats[i], groupbeats[j]) for i, j in zip(indices[0], indices[1])]

for i, j in topBeatPairs:
df_care_max_scores = df_care.groupBy('measure_id').max().collect()
df_care_max_scores = sc.broadcast(df_care_max_scores)

# function to extract max_score for each measure_id
def get_max_score(id):
    return [score[1] for score in df_care_max_scores.value if score[0] == id][0]

# creating a new RDD containing an extra column for the normalized score,
# that is, the ratio of the current score to the maximum score for the measure_id
df_care = df_care.map(lambda p: Row(
    provider_id=p[0],
    measure_id=p[1],
    score=p[2],
    normalized_score=float(p[2]) / get_max_score(p[1])
))

# creating dataframe from the RDD
df_care = sqlCtx.createDataFrame(df_care)

# get total normalized score per hospital
df_care = df_care.groupBy('provider_id').sum('normalized_score').map(lambda p: Row(
    provider_id=p[0],
    care_score=p[1]))
df_care = sqlCtx.createDataFrame(df_care)

# merge df_care and df_survey tables
df = df_care.join(df_survey, df_care.provider_id == df_survey.provider_id).select(
    df_survey.provider_id, df_survey.survey_score, df_care.care_score)

survey_care_rdd = df.map(lambda p: (p.survey_score, p.care_score))
correlation = Statistics.corr(survey_care_rdd)
print 'correlation between hospital quality score and survey responses : %.3f' % (correlation[0][1])
from datetime import datetime

sc = SparkContext(appName="Run 1 - Corr-Wide - Data95-08 - AWS")

data_file = "s3://aws-logs-012060642840-us-west-2/elasticmapreduce/cloud_proj/95-08.csv"
raw_data = sc.textFile(data_file).cache()

# extract header
header = raw_data.first()
raw_data = raw_data.filter(lambda x: x != header)

def parse_interaction(line):
    # split lines based on the delimiter, and create a list
    line_split = line.split(",")
    # replace NA with zeros
    line_split = [w.replace('NA', '0') for w in line_split]
    # remove year, and other non-numeric data
    """
    0 = Year
    """
    symbolic_indexes = [0, 8, 10, 16, 17, 22]
    clean_line_split = [item for i, item in enumerate(line_split) if i not in symbolic_indexes]
    return np.array([float(x) for x in clean_line_split])

vector_data = raw_data.map(parse_interaction)

# start timer at this point
startTime = datetime.now()
print(Statistics.corr(vector_data, method="pearson"))
print('Time consumed = '), (datetime.now() - startTime)

sc.stop()
beatYrComboCount = file1.map(lambda x: ((x[10], int(x[17])), 1)).reduceByKey(lambda x, y: x + y).cache()

# List of beat-yr combos
beatYr = beatYrComboCount.keys().flatMap(lambda x: [(x[0], y) for y in range(2001, 2016)])

# Find missing beats and union with non-missing to create full set
missing = beatYr.subtract(beatYrComboCount.keys()).distinct()
allCrimeCnts = beatYrComboCount.union(missing.map(lambda x: (x, 0)))
file_cnts = allCrimeCnts.map(lambda x: (x[0][1], (x[0][0], x[1]))).groupByKey().mapValues(
    lambda x: sorted(list(x), key=lambda x: x[0])).cache()

# List of beats
beats = [element[0] for element in file_cnts.values().first()]

vectorCnts = file_cnts.values().map(lambda x: Vectors.dense([element[1] for element in x]))
cor = Statistics.corr(vectorCnts, method='pearson')
cor.flags['WRITEABLE'] = True
np.fill_diagonal(cor, 0.0)

# Get top 10 correlation values from matrix
sorted = cor.argsort(axis=None)
ind = np.unravel_index(sorted[-20::2], cor.shape)
mostCorrBeatPairs = [(beats[i], beats[j]) for i, j in zip(ind[0], ind[1])]
for i, j in mostCorrBeatPairs:
    print i, j
cord = orders.select("order_id", "user_id")
ordpr = order_products.select("order_id", "product_id")
f = ordpr.join(cord, "order_id")

more = cord.groupby("user_id").count()
larger = f.groupby("user_id").count()

more = more.sort(asc("user_id"))
more.show()
mo = more.select("count")
mo = mo.collect()

larger = larger.sort(asc("user_id"))
la = larger.select("count")
la = la.collect()

# A = more.join(larger, more.user_id == larger.user_id)
# a = more.select("count")
# a = more.collect()
# a = np.array(a)
# b = larger.select("count")
# b = more.collect()
# b = np.array(b)

from pyspark.mllib.stat import Statistics

# Statistics.corr on two RDDs of doubles returns a plain float (there is no .head()),
# so extract the count values from the collected Rows before parallelizing.
mo_counts = sc.parallelize([float(row["count"]) for row in mo])
la_counts = sc.parallelize([float(row["count"]) for row in la])
r1 = Statistics.corr(mo_counts, la_counts, method="pearson")
print("Correlation\n" + str(r1))
__author__ = 'Michael'

from pyspark.mllib.stat import Statistics

api_mentions = sc.parallelize([23, 35, 56])
mpetyx_tweets = sc.parallelize([45, 67, 76])

corr = Statistics.corr(api_mentions, mpetyx_tweets, "pearson")
def readRankMatrix():
    import numpy as np
    lines = sc.textFile('../yelp_trans.csv')
    rawData = lines.mapPartitionsWithIndex(removeHeader)
    mydata = rawData.map(removeColumns).cache()
    return mydata

from pyspark.mllib.stat import Statistics
from pandas import Series
import pandas as pd
import numpy as np
import math

mydata = readRankMatrix()
corr = Statistics.corr(mydata)

# set up the column names and add a new column called user_id
lines2 = sc.textFile('../yelp.csv')
names = lines2.map(lambda line: line.split(",")).map(lambda a: a[0]).collect()[1:]
s = Series([str for str in names])
pddata = pd.DataFrame(corr, columns=s)
pddata['user_id'] = names

df_corr = sqlContext.createDataFrame(pddata)
# df_corr.cache()
df_corr.registerTempTable("corr")

def getTopReviewUsers(n):
    # n: the nth highest user
    ord_user = sqlContext.sql("select user_id, count(review_id) as count from reviews_json group by user_id order by count desc")
if __name__ == "__main__": if len(sys.argv) not in [1, 2]: print("Usage: correlations (<file>)", file=sys.stderr) exit(-1) sc = SparkContext(appName="PythonCorrelations") if len(sys.argv) == 2: filepath = sys.argv[1] else: filepath = 'data/mllib/sample_linear_regression_data.txt' corrType = 'pearson' points = MLUtils.loadLibSVMFile(sc, filepath)\ .map(lambda lp: LabeledPoint(lp.label, lp.features.toArray())) print() print('Summary of data file: ' + filepath) print('%d data points' % points.count()) # Statistics (correlations) print() print('Correlation (%s) between label and each feature' % corrType) print('Feature\tCorrelation') numFeatures = points.take(1)[0].features.size labelRDD = points.map(lambda lp: lp.label) for i in range(numFeatures): featureRDD = points.map(lambda lp: lp.features[i]) corr = Statistics.corr(labelRDD, featureRDD, corrType) print('%d\t%g' % (i, corr)) print() sc.stop()
        out.append(int(l[i]) - int(l[i - 1]))
    return out

if __name__ == "__main__":
    cov = sys.argv[1]
    senti = sys.argv[2]
    # output = sys.argv[2]
    sc = SparkContext("local", "death_trend")

    covid = sc.textFile(cov).map(lambda line: parseLine(line).split(
        ",")).filter(lambda line: line[-1].isdigit()).map(
            lambda line: ["global", dailyIncrease(line[60::])]).reduceByKey(
                combineSeries).collect()
    covid = sc.parallelize(covid[0][1])
    maximum = covid.max()
    covid = covid.map(lambda each: float(each) / float(maximum))
    covid.saveAsTextFile("covid")

    sent = sc.textFile(senti).map(lambda line: line.split(" ")).map(
        lambda line: [int(line[0][14:-4]), float(line[3])]).collect()  # .saveAsTextFile("senti_processed")
    sorted_senti = sorted(sent)
    sorted_senti = sc.parallelize(sorted_senti).map(lambda x: x[1])
    max_senti = sorted_senti.max()
    sorted_senti = sorted_senti.map(lambda x: float(x) / float(max_senti))
    sorted_senti.saveAsTextFile("sorted_senti")
    # print(sorted_senti)

    print(Statistics.corr(covid, sorted_senti))
def dist_corr(v1, v2):
    """
    Function to compute the correlation between two Spark RDDs.
    """
    return Statistics.corr(v1, v2)
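# A quick usage sketch for dist_corr, assuming an active SparkContext `sc`;
# the numbers below are illustrative only.
v1 = sc.parallelize([1.0, 2.0, 3.0, 4.0])
v2 = sc.parallelize([2.0, 4.1, 6.2, 7.9])
print(dist_corr(v1, v2))  # Statistics.corr defaults to Pearson correlation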
if __name__ == "__main__": #Simple Correlation sc = pyspark.SparkContext() csv = np.genfromtxt('C:\\SBU\\Fall2017\\BigData\\Project\\BDProject\\final_test_selfHarm.csv', delimiter=",",skip_header=3) second = csv[:, 10].astype(float) print(second[:3]) third = csv[:, 17].astype(float) print(third[:3]) a = [] b = [] for i in range(len(second)): if (math.isnan(second[i]) or math.isnan(third[i])): continue else: a.append(second[i]) b.append(third[i]) a = np.asarray(a) b = np.asarray(b) seriesX = sc.parallelize(a) seriesY = sc.parallelize(b) corrval = Statistics.corr(seriesX, seriesY, method="pearson") print(corrval)
""" Testing with Correlation https://spark.apache.org/docs/latest/mllib-statistics.html """ from pyspark.mllib.stat import Statistics from pyspark import SparkContext from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.linalg import SparseVector, Vectors sc = SparkContext("local", "Rubbish") seriesX = sc.parallelize([1.0, 2.0, -2.0], 2) seriesY = sc.parallelize([3.0, 4.0, 5.0], 2) corrXY = Statistics.corr(seriesX, seriesY, method="pearson") # RDD of Vectors data = sc.parallelize([Vectors.dense([2, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]), Vectors.dense([6, 7, 0, 8])]) print "Correlation between x & y: ", corrXY print "Correlation matrix: ", data
bundle_pearson_dict = {}  # dictionary to hold the bundle as key and the coefficient as value

for bundle_name in actual_bundle_list:
    final_table_by_bundle = sqlContext.sql(
        "select * from final_table_sorted where bundle = \"" + bundle_name + "\"")
    food_metric_only = final_table_by_bundle.map(lambda p: p.zip_AGI_foodmetric[2])
    food_metric_list = food_metric_only.collect()
    weighted_AGI_only = final_table_by_bundle.map(lambda p: p.zip_AGI_foodmetric[1])
    weighted_AGI_list = weighted_AGI_only.collect()
    if not food_metric_list and not weighted_AGI_list:
        print 'pass'
    else:
        x = sc.parallelize(weighted_AGI_list, 2)
        y = sc.parallelize(food_metric_list, 2)
        correlation_coeff = Statistics.corr(x, y, method="pearson")  # -0.128161962745 or is it -0.0965926041863??
        bundle_pearson_dict[bundle_name] = correlation_coeff

bundle_pearson_dict  # to get all coefficient values by bundle

# In[53]:

# Here is an example scatter plot for bundle_name = 'vegetables' to give an idea of how the plot looks.
# x is the AGI for every zip code
# y is the food metric
# An example plot is also available to be viewed in the parent folder.
final_table_by_bundle = sqlContext.sql("select * from final_table_sorted where bundle = 'vegetables'")
food_metric_only = final_table_by_bundle.map(lambda p: p.zip_AGI_foodmetric[2])
food_metric_list = food_metric_only.collect()
# keep only Cyl, Displacement
autoVectors = dataLines.map(transformToNumeric)
autoVectors.collect()

# perform analysis
from pyspark.mllib.stat import Statistics
from pyspark.sql import SQLContext

autoStats = Statistics.colStats(autoVectors)
autoStats.mean()
autoStats.variance()
autoStats.min()
autoStats.max()
Statistics.corr(autoVectors)

sqlContext = SQLContext(sc)

def transformToLabeledPoint(inStr):
    lp = (float(inStr[0]), Vectors.dense(inStr[1]))
    return lp

autoLp = autoVectors.map(transformToLabeledPoint)
autoDf = sqlContext.createDataFrame(autoLp, ["label", "features"])
autoDf.select("label", "features").show(10)

# SPLIT
import numpy as np
from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="CorrelationsExample")  # SparkContext

    # $example on$
    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
    # seriesY must have the same number of partitions and cardinality as seriesX
    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))

    data = sc.parallelize([
        np.array([1.0, 10.0, 100.0]),
        np.array([2.0, 20.0, 200.0]),
        np.array([5.0, 33.0, 366.0])
    ])  # an RDD of Vectors

    # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print(Statistics.corr(data, method="pearson"))
    # $example off$

    sc.stop()
dataLines = rddUSD.filter(lambda x: x != header)
dataLines.count()
dataLines.first()
dataLines.take(5)

# RDD to dense vectors
vectorsUSD = dataLines.map(transformationDT.transformToNumeric)
vectorsUSD.take(5)

# Perform statistical analysis
statsUSD = Statistics.colStats(vectorsUSD)
statsUSD.mean()
statsUSD.variance()
statsUSD.min()
statsUSD.max()
Statistics.corr(vectorsUSD)

# SPARK SQL
dataframe = pycsv.csvToDataFrame(sqlContext, rddUSD, sep=",")
dataframe.registerTempTable("dataUSDuprv")
dff1 = sqlContext.sql("SELECT closeJPY FROM dataUSDuprv").show()
dataframe.show()

# LabeledPoint
lpUSD = vectorsUSD.map(transformationDT.transformToLabeledPoint)
lpUSD.take(5)
dfUSD = sqlContext.createDataFrame(lpUSD, ["label", "features"])
dfUSD.select("label", "features").show(10)

# String Indexer