def spark_pearson(a, b):
    # `sc` is an existing SparkContext; `func` is presumably the wrapped
    # function supplied by an enclosing (decorator) scope, whose globals are
    # patched with the correlation results before it is called.
    rdd_a = sc.parallelize(a)
    rdd_b = sc.parallelize(b)
    g = func.func_globals  # Python 2 attribute; use func.__globals__ on Python 3
    g['pearson'] = Statistics.corr(rdd_a, rdd_b, 'pearson')
    g['rho'] = Statistics.corr(rdd_a, rdd_b, 'spearman')
    func(a, b)
Example #3
def CorrelationFeature(vectors):

    matriz = sc.broadcast(Statistics.corr(vectors, method="pearson"))

    summary = Statistics.colStats(vectors)

    varianza = summary.variance()

    ######### new heuristic (Diogo's proposal)
    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = varianza[i] / aij[i]

    r = sorted([(value, key) for (key, value) in w.items()],
               reverse=True)  #features sorted

    index = []
    for i in r:
        index.append(i[1])

    index = index[0:6]  #taking the first 6 features

    return index
    def _transform(self, df):

        for k, v in df.schema[
                self.inputCol].metadata["ml_attr"]["attrs"].items():
            features_df = pd.DataFrame(v)

        column_names = list(features_df['name'])
        df_vector = df.rdd.map(lambda x: x[self.inputCol].toArray())

        #self.correlation_type is class parameter
        matrix = Statistics.corr(df_vector, method=self.correlation_type)

        # apply pandas dataframe operation on the fit output
        corr_df = pd.DataFrame(matrix,
                               columns=column_names,
                               index=column_names)
        final_corr_df = pd.DataFrame(corr_df.abs().unstack().sort_values(
            kind='quicksort')).reset_index()
        final_corr_df.rename(
            {
                'level_0': 'col1',
                'level_1': 'col2',
                0: 'correlation_value'
            },
            axis=1,
            inplace=True)
        final_corr_df = final_corr_df[
            final_corr_df['col1'] != final_corr_df['col2']]

        #shortlisted dataframe based on custom cutoff
        shortlisted_corr_df = final_corr_df[
            final_corr_df['correlation_value'] > self.correlation_cutoff]
        return corr_df, shortlisted_corr_df
def compute_correlation_matrix(df, method='pearson'):
    # keep only float/double columns; adjust this filter to the datatypes you need
    columns = [item[0] for item in df.dtypes
               if item[1].startswith('float') or item[1].startswith('double')]
    df_filter = df.select(columns)
    df_rdd = df_filter.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(df_rdd, method=method)
    corr_mat_df = pd.DataFrame(corr_mat,
                               columns=df_filter.columns,
                               index=df_filter.columns)
    return corr_mat_df
Example #6
def compute_correlation_matrix(df, method='pearson'):
    # wrapper around
    # https://forums.databricks.com/questions/3092/how-to-calculate-correlation-matrix-with-all-colum.html
    df_rdd = df.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(df_rdd, method=method)
    corr_mat_df = pd.DataFrame(corr_mat, columns=df.columns, index=df.columns)
    return corr_mat_df
Example #7
def calculateCorrelation(rdd1, rdd2):
    joined_rdd = rdd1.join(rdd2).sortByKey()

    rdd1_values = joined_rdd.map(lambda x:x[1][0])
    rdd2_values = joined_rdd.map(lambda x:x[1][1])
    correlation_value = Statistics.corr(rdd1_values, rdd2_values)
    return (joined_rdd,correlation_value)
Example #8
def compute_correlation_matrix(df, method='pearson'):
    df_rdd = df.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(df_rdd, method=method)
    corr_mat_df = pd.DataFrame(corr_mat,
                    columns=df.columns, 
                    index=df.columns)
    return corr_mat_df
def correlationTemperatureHardness(df,spark):
    column1 = df.select('temperature').rdd.map(lambda x: x['temperature']).filter(lambda x: x is not None).filter(lambda x: x != '')
    column2 = df.select('hardness').rdd.map(lambda x: x['hardness']).filter(lambda x: x is not None).filter(lambda x: x != '')
    data = column1.zip(column2)
    corr_matrix = Statistics.corr(data)
    
    return corr_matrix[1][0]
def compute_correlation_matrix(df,method='spearman'):
    
    churn_data3_rdd = df.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(churn_data3_rdd, method=method)
    corr_mat_churn_data3 = pd.DataFrame(corr_mat,
                    columns=df.columns, 
                    index=df.columns)
    return corr_mat_churn_data3
Example #11
def estimate_correlation_matrix(df, cols, method='pearson', round_decimals=3):

    features = df.select(cols).rdd.map(lambda row: row[0:])
    corr_mat= pd.DataFrame(
        Statistics.corr(features, method=method), columns=cols, index=cols) \
        .round(round_decimals) \
        .style \
        .background_gradient(cmap='coolwarm')

    return corr_mat
Example #12
def correlations(sdf, colnames, method='pearson', ax=None, plot=True):
    sdf = sdf.notHandy()
    correlations = Statistics.corr(sdf.select(colnames).dropna().rdd.map(lambda row: row[0:]), method=method)
    pdf = pd.DataFrame(correlations, columns=colnames, index=colnames)
    if plot:
        if ax is None:
            fig, ax = plt.subplots(1, 1)
        return sns.heatmap(round(pdf,2), annot=True, cmap="coolwarm", fmt='.2f', linewidths=.05, ax=ax)
    else:
        return pdf
Example #13
def main():
    ###Loading data from sources
    print 'before  preprocess'
    data = [preprocess(input_file)]
    print 'after preprocess'
    #get spark context
    sc = getSparkContext()
    print 'before parallelize'
    data = np.hstack((data[0]['train_data'], data[0]['train_labels'].reshape(
        (data[0]['train_labels'].shape[0], 1))))
    data = [
        Vectors.dense(list(data[row, :])) for row in range(0, data.shape[0])
    ]
    samples = sc.parallelize(data)
    #samples.persist()
    pearsonCorr = Statistics.corr(samples)
    print str(pearsonCorr).replace('nan', 'NaN')
    sys.exit()
    # unreachable: sys.exit() above stops execution, and `data` here is a
    # plain Python list rather than an RDD, so this call would fail anyway
    print Statistics.corr(data, method="pearson")
Example #14
def DropColsByCor(df, cor_cutoff):

    tdf = df
    dsu_dict = {}

    string_cols = []
    for (a, b) in df.dtypes:
        if b == 'string':
            string_cols.append(a)

    for cols in string_cols:
        tdf = tdf.drop(cols)

    num_cols = len(tdf.columns)
    dsu = [i for i in range(num_cols)]
    size = [1 for i in range(num_cols)]

    features = tdf.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(features, method="pearson")

    for i in range(num_cols):
        for j in range(i):
            if corr_mat[i][j] > cor_cutoff:
                union(dsu, size, i, j)

    drop_cols = []
    for i in range(num_cols):
        if dsu[i] != i:
            drop_cols.append(tdf.columns[i])

        #Setting up dictionary to save up on iterations
        if dsu[i] == i:
            dsu_dict[tdf.columns[i]] = [tdf.columns[i]]

    for i in range(num_cols):
        if dsu[i] != i:
            ri = root(dsu, i)
            dsu_dict[tdf.columns[ri]].append(tdf.columns[i])

    for cols in drop_cols:
        tdf = tdf.drop(cols)

    string_df = df.select(string_cols)

    #Adding index to help merge both string and numeric dataframes
    tdf = tdf.withColumn("RowNoIndex", monotonically_increasing_id())
    string_df = string_df.withColumn("RowNoIndex",
                                     monotonically_increasing_id())
    tdf = tdf.join(string_df, ['RowNoIndex'])
    tdf = tdf.drop('RowNoIndex')

    return dsu_dict, tdf
Example #15
    def corr(sdf_) -> pd.DataFrame:
        """Calculate correlation of data

        :param sdf_: pyspark dataframe
        :return: Correlation matrix in a pandas dataframe
        """
        col_names = sdf_.columns

        features = sdf_.rdd.map(lambda row: row[0:])
        corr_mat = Statistics.corr(features, method="pearson")
        corr_df = pd.DataFrame(corr_mat)
        corr_df.index, corr_df.columns = col_names, col_names
        return corr_df
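# A minimal usage sketch for the helper above (not from the original source):
# it assumes the helper is in scope as a plain function and that an active
# SparkSession named `spark` exists; the toy columns are made up.
sdf = spark.createDataFrame(
    [(1.0, 2.0, 3.0), (2.0, 1.0, 4.5), (3.0, 0.5, 6.0)],
    ["a", "b", "c"])
print(corr(sdf))  # 3x3 pandas DataFrame indexed and labelled by a, b, c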
Example #16
def CorrelationFeature(vectors):

    #	print 'Calculation Correlation'

    matriz = sc.broadcast(Statistics.corr(vectors, method="pearson"))

    summary = Statistics.colStats(vectors)

    varianza = summary.variance()

    ######### new heuristic (Diogo's proposal)
    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = varianza[i] / aij[i]

    r = sorted([(value, key) for (key, value) in w.items()],
               reverse=True)  #features sorted

    #print r

    #	print 'calculating features selections'

    #Old heuristic
    # # w={}
    # # for i in range(len(matriz)):
    # # 	w[i]=0
    # # 	for j in np.nan_to_num(matriz[i]):
    # # 		k=abs(j)
    # # 		w[i]=w[i]+k

    # r=sorted([(value,key) for (key,value) in w.items()],reverse=True)

    #####""
    #vectors=np.matrix(vectors)
    #beforeMatrix=vectors.map(lambda x: np.matrix(x))

    index = []
    for i in r:
        index.append(i[1])

    index = index[0:6]  #taking the first 6 features

    #MatrixReducer(vectors,index)
    return index
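# Hypothetical usage sketch for CorrelationFeature (not part of the original
# snippet): it assumes an active SparkContext `sc` and the names used above
# (numpy as np, pyspark.mllib.stat.Statistics) are already imported.
from pyspark.mllib.linalg import Vectors

rows = [Vectors.dense([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]),
        Vectors.dense([2.0, 1.5, 3.5, 4.5, 5.0, 6.5, 7.5]),
        Vectors.dense([3.0, 2.5, 2.0, 4.0, 6.0, 5.0, 8.0]),
        Vectors.dense([4.0, 3.5, 5.0, 3.0, 7.0, 4.5, 9.0])]
vectors = sc.parallelize(rows)

# returns the indices of the 6 features ranked highest by variance / sum(|corr|)
print(CorrelationFeature(vectors))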
Example #17
def _compute_corr_matrix(spark_df, corr_method='pearson'):
    """
    A helper function for computing a correlation matrix of a spark dataframe (works only with numeric columns).
    The correlation matrix represents the pair correlation of all the variables. By default the method will use
    Pearson correlation (a measure of the linear correlation between two variables X and Y,
    it has a value between +1 and -1, where 1 is total positive linear correlation,
    0 is no linear correlation, and -1 is total negative linear correlation).

    The correlation matrix is computed with Spark.

    Args:
        :spark_df: the spark dataframe to compute the correlation matrix for
        :corr_method: the correlation method, defaults to pearson (spearman supported as well)

    Returns:
        a pandas dataframe with the correlation matrix

    Raises:
        :ValueError: when the provided dataframe is of a structure that can't be used for computing correlations.
    """
    numeric_columns = spark_df.dtypes  # dtypes of all columns; the dataframe is expected to contain only numeric columns
    if (len(numeric_columns) == 0):
        raise ValueError(
            "The provided spark dataframe does not contain any numeric columns. "
            "Cannot compute feature correlation on categorical columns. The numeric datatypes are: {}"
            " and the number of numeric datatypes in the dataframe is: {} ({})"
            .format(constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES,
                    len(spark_df.dtypes), spark_df.dtypes))
    if (len(numeric_columns) == 1):
        raise ValueError(
            "The provided spark dataframe only contains one numeric column. "
            "Cannot compute feature correlation on just one column. The numeric datatypes are: {}"
            "and the number of numeric datatypes in the dataframe is: {} ({})".
            format(constants.SPARK_CONFIG.SPARK_NUMERIC_TYPES,
                   len(spark_df.dtypes), spark_df.dtypes))
    if (len(numeric_columns) >
            constants.FEATURE_STORE.MAX_CORRELATION_MATRIX_COLUMNS):
        raise ValueError("The provided dataframe contains  {} columns, "
                         "feature correlation can only be computed for "
                         "dataframes with < {} columns due to scalability "
                         "reasons (number of correlatons grows "
                         "quadratically with the number of columns)" \
                         .format(len(numeric_columns), constants.FEATURE_STORE.MAX_CORRELATION_MATRIX_COLUMNS))
    spark_df_rdd = spark_df.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(spark_df_rdd, method=corr_method)
    pd_df_corr_mat = pd.DataFrame(corr_mat,
                                  columns=spark_df.columns,
                                  index=spark_df.columns)
    return pd_df_corr_mat
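# Hedged usage sketch (not from the original module): assumes an active
# SparkSession named `spark`; the real helper also needs the surrounding
# `constants` module for its validation limits. The toy DataFrame is made up.
spark_df = spark.createDataFrame(
    [(1.0, 10.0, 3.0), (2.0, 18.0, 2.5), (3.0, 35.0, 2.0), (4.0, 41.0, 1.0)],
    ["f1", "f2", "f3"])
corr_pdf = _compute_corr_matrix(spark_df, corr_method="pearson")
print(corr_pdf)  # 3x3 pandas DataFrame indexed by f1, f2, f3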
Example #18
def correlation(df, target_col):
    # drop string columns
    columns_to_drop = [
        item[0] for item in df.dtypes if item[1].startswith('string')
    ]
    df_numeric = df.drop(*columns_to_drop)

    # generate correlation matrix
    features = df_numeric.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(features, method="pearson")
    corr_df = pd.DataFrame(corr_mat)
    corr_df.index, corr_df.columns = df_numeric.columns, df_numeric.columns
    corr_df = corr_df[target_col]

    return corr_df
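# Hypothetical usage (not from the original source); assumes an active
# SparkSession named `spark`. String columns such as `id` are dropped before
# the matrix is computed, and a pandas Series of correlations with the
# target column is returned.
df = spark.createDataFrame(
    [("a", 1.0, 2.0, 10.0), ("b", 2.0, 1.0, 19.0), ("c", 3.0, 0.5, 31.0)],
    ["id", "x1", "x2", "target"])
print(correlation(df, "target"))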
Example #19
    def _run_all_in_master_memory(self, method):
        """
        Run the Spark Pearson correlation by loading all the TS content (i.e. the values) into master memory

        Each coefficient will be computed by a worker (Spark decides the best choice to apply)
        """

        # Create or get a spark Context
        spark_context = ScManager.get()

        # Get TS content
        rdd_content = self._get_ts(spark_context)

        # Job distribution is made by Statistics.corr (Spark correlation matrix calculation)
        self.results = Statistics.corr(rdd_content, method=method)

        ScManager.stop()
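# The docstring above boils down to handing Statistics.corr an RDD of
# equal-length vectors and letting Spark distribute the work. A standalone
# sketch of that pattern (the SparkContext and the toy values below are
# stand-ins for ScManager.get() and self._get_ts(), which are not shown here):
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

sc = SparkContext(appName="CorrMatrixSketch")
rdd_content = sc.parallelize([Vectors.dense([1.0, 2.0, 3.0]),
                              Vectors.dense([2.0, 4.1, 5.9]),
                              Vectors.dense([3.0, 6.2, 9.1]),
                              Vectors.dense([4.0, 7.8, 12.2])])
results = Statistics.corr(rdd_content, method="pearson")  # numpy ndarray
print(results)
sc.stop()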
Example #20
def create_or_update_week(influencer_tweets, topic_tweets, week):

    topic_cor = []
    influencer_cor = []
    for t in topic_tweets:
        for i in influencer_tweets:
            if t['time'] == i['time']:
                topic_cor.append(t['count'])
                influencer_cor.append(i['count'])

    if len(topic_cor)<=1:
        corr = 0
    else:

        sc = SparkContext(appName="CorrelationPerWeek")

        topic_tweets = sc.parallelize(topic_cor)
        influencer_tweets = sc.parallelize(influencer_cor)

        corr = Statistics.corr(topic_tweets, influencer_tweets, "pearson")

        sc.stop()

    url = "http://localhost:8000/api/weeks/"

    today = datetime.fromtimestamp(week/1000.0)
    payload = '{    "score": %f,    "start_date": "%s"  }' % (
        float(corr), str(today.year) + "-" + str(today.month) + "-" + str(today.day))
    headers = {
        'authorization': "Basic ZGV2OjEyMzQ=",
        'content-type': "application/json",
        'cache-control': "no-cache",
        'postman-token': "7c8668c0-a4c2-f42d-66a9-95cbfb7806c5"
    }

    try:
        response = requests.request("POST", url, data=payload, headers=headers)
        return  response.json()['id']
    except:
        print "error"

    return 1
Example #21
def CorrelationFeature(vectors, schema):

    print("Calculating Correlation")

    vectors_rdd = vectors.rdd.map(
        lambda row: Vectors.dense([x for x in row["features"]]))

    matriz = spark.sparkContext.broadcast(
        Statistics.corr(vectors_rdd, method="pearson"))

    summary = Statistics.colStats(vectors_rdd)

    variance = summary.variance()

    ######## Heuristic ########

    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = variance[i] / aij[i]

    r = sorted([(value, key) for (key, value) in w.items()], reverse=True)

    index = r[0:6]

    a = []

    for i in index:
        a.append((0, int(i[1])))

    red = MatrixReducer(vectors_rdd, a, schema)

    return red
Example #22
def get_language_correlation():
    """
        calculates the correlation between github languages
    """
    #Create Spark Context
    sc = SparkContext(appName="LanguageCorrelations")

    #Create SQL Context
    sqlCtx = SQLContext(sc)

    #Create a schemaRDD from json datasets stored in HDFS
    pushes = sqlCtx.jsonFile('git_14_15/git_results')

    #Register the schemaRDD as a Table
    pushes.registerTempTable('pushes')

    #filter the data to get the pushes for the languages from LANG
    filtered = sqlCtx.sql('select * from pushes where repository_language in ' + str(tuple(LANG)))

    #perform map transformation to get the rdd in the format (actor, {lang : pushes})
    f_pair = filtered.map(lambda s: (s.actor, {s.repository_language:s.pushes}))

    #group the RDD's based on actor to get the RDD of the format (actor, [{lang1 : pushes},{lang2 : pushes}...])
    f_group = f_pair.groupByKey()

    #merge lang dictionaries to get a single ordered dict per actor
    f_merged = f_group.map(lambda s: merge_lang_dict(s[1]))

    #create an rdd of vectors from the pushes values, which is required for the correlation algorithm
    vectors = f_merged.map(lambda s: Vectors.dense(map(float, s.values())))  
    
    #call the correlation function
    matrix = Statistics.corr(vectors)
    print matrix
    plot_graph(matrix)
    sc.stop()
Example #23
# Correlation matrix
# print(df.corr())


# ### Mllib Statistics

# In[5]:

from pyspark.mllib.stat import Statistics
# Basic statistics
partsNum = parts.map(lambda line: line[0:8])
summary = Statistics.colStats(partsNum)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())
Statistics.corr(partsNum, method="pearson")


# # Supervised classification

# ## Naive Bayes

# In[6]:

from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
import utils_mesure
nomF_svm = "glass_svm"
data = sc.textFile("file:/C:/spark-1.6.0-bin-hadoop2.4/"+nomF_svm+".csv")

# drop the header
nomColInit = data.first()
 def pearson(self):
     return Statistics.corr(self.a, self.b, 'pearson')
Example #25
# each vector represents a year, with values corresponding to crimes for each beat
# years are the "rows", and beats are "columns"
crimesVectors = crimesByYear3.map(lambda x: Vectors.dense(x[1]))

crimes3.unpersist()

####  Correlation ##########################################################

# If a single RDD of Vectors is passed in, a correlation 
# matrix comparing the columns in the input RDD is returned.

# If you want to explore your data it is best to compute both, since 
# the relation between the Spearman (S) and Pearson (P) correlations will give some information. Briefly, 
# S is computed on ranks and so depicts monotonic relationships while P is on true values and depicts linear relationships.
# http://stats.stackexchange.com/questions/8071/how-to-choose-between-pearson-and-spearman-correlation
pearsonCorr = Statistics.corr(crimesVectors)
spearmanCorr = Statistics.corr(crimesVectors, method="spearman")
print pearsonCorr
print spearmanCorr
type(pearsonCorr)

# Check dimension should be #beats, #beats
pearsonCorr.shape

# create correlation dictionary function
def createCorrDic(corr):
	"""
	Key: (i,j), Value: value 
	So (i,j) represents to index for beats, value the correlation value between them

	@ param: correlation matrix
import numpy as np

from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="CorrelationsExample")  # SparkContext

    # $example on$
    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
    # seriesY must have the same number of partitions and cardinality as seriesX
    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))

    data = sc.parallelize(
        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([5.0, 33.0, 366.0])]
    )  # an RDD of Vectors

    # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print(Statistics.corr(data, method="pearson"))
    # $example off$

    sc.stop()
Example #27
from pyspark.sql import HiveContext
from pyspark.mllib.stat import Statistics
from pyspark import SparkContext

sc = SparkContext()

sqlContext = HiveContext(sc)

initialquery = sqlContext.sql("""
    SELECT A.avg_procedure_score, B.patientsurveyscore
    FROM (SELECT p.hospitalid,
                 avg(p.score) as avg_procedure_score
          FROM procedures p
          GROUP BY p.hospitalid) A
    JOIN survey_results B ON B.hospitalid = A.hospitalid""")

survey_score = initialquery.map(lambda x: x.patientsurveyscore)
avg_procedure_scores = initialquery.map(lambda x: x.avg_procedure_score)

print Statistics.corr(avg_procedure_scores, survey_score, method="pearson")
Example #28
    np.array([2.0, 20.0, 200.0]),
    np.array([3.0, 30.0, 300.0])
])

summary = Statistics.colStats(mat)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())

## correlation
# vectors
seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])
seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 55.0])

print('Pearson correlation is: {}'.format(
    Statistics.corr(seriesX, seriesY, method='pearson')))
print('Spearman correlation is: {}'.format(
    Statistics.corr(seriesX, seriesY, method='spearman')))

# matrix
print('Correlation of matrix: {}'.format(Statistics.corr(mat,
                                                         method='pearson')))

## sampling
# sampling methods can be performed on RDD's of key-value pairs
data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'),
                       (3, 'f')])

fractions = {1: 0.1, 2: 0.6, 3: 0.3}
approxSample = data.sampleByKey(False, fractions)
Example #29
#Remove the first line
header=rddUSD.first()
dataLines = rddUSD.filter(lambda x: x != header)
dataLines.take(5)


usdVectors = dataLines.map(transformationLR.transformToNumeric)

#Perform statistical Analysis

usdStats=Statistics.colStats(usdVectors)
usdStats.mean()
usdStats.variance()
usdStats.min()
usdStats.max()
Statistics.corr(usdVectors)
#Transform to a Data Frame for input to Machine Learing
#Drop columns that are not required (low correlation)

    
usdLP = usdVectors.map(transformationLR.transformToLabeledPoint)
usdDF = sqlContext.createDataFrame(usdLP, ["label", "features"])
usdDF.select("label", "features").show(10)

#Split into training and testing data
(trainingData, testData) = usdDF.randomSplit([0.7, 0.3])
trainingData.count()
testData.count()

#Build the model on training data
lr = LinearRegression(maxIter=10)
# Vectorise the data and drop na's.
features_vector = assembler.transform(
    features_subset.na.drop()).select(vector_col)

# Get correlation matrix
corr_matrix = Correlation.corr(features_vector, vector_col)

# Output the matrix.
corr_matrix.collect()[0]["pearson({})".format(vector_col)].values

# Other correlation implementation.
from pyspark.mllib.stat import Statistics
col_names = features_subset.columns
features = features_subset.na.drop().rdd.map(lambda row: row[0:])
corr_mat = Statistics.corr(features, method="pearson")
corr_df = pd.DataFrame(corr_mat)
corr_df.index, corr_df.columns = col_names, col_names
print(corr_df.to_string())

# For a correlation plot of all variables in the features dataset, see the Python script "Corr_plot.ipynb".

# For the first 5 columns, we see that Area_Method_of_Moments_Overall_Standard_Deviation_4 is highly
# correlated with Area_Method_of_Moments_Overall_Standard_Deviation_5 with a correlation coefficient of
# 0.946 (3dp), and Area_Method_of_Moments_Overall_Standard_Deviation_2 is highly correlated with
# Area_Method_of_Moments_Overall_Standard_Deviation_4 with a correlation coefficient of 0.849 (3dp).

##########################

# Define the schema of the MAGD dataset.
MAGD_schema = StructType([
Example #31
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation  # DataFrame-based correlation (Statistics.corr works on RDDs)

data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), ),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]), ),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]), ),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), )]
sc = SparkContext("local", "sample")
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(data, ["features"])

r1 = Correlation.corr(df, "features").head()
print("Pearson correlation matrix:\n" + str(r1[0]))

r2 = Correlation.corr(df, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))
Example #32
summary = Statistics.colStats(mat)

print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())
print(summary.max())
print(summary.min())
print(summary.count())
print(summary.normL1())
print(summary.normL2())

#correlation
x = sc.parallelize(np.random.randn(4, 1))
y = sc.parallelize(np.random.randn(4, 1))
print("Correlation :", str(Statistics.corr(x, y)))

#Chi-square
#For Vector
x = Vectors.dense(np.random.random_sample((5)))
y = Vectors.dense(np.random.random_sample((5)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)

# For Matrices
x = Matrices.dense(4, 2, np.random.random_sample((8)))
y = Matrices.dense(4, 2, np.random.random_sample((8)))
chisqr = Statistics.chiSqTest(x, y)
 def rho(self):
     return Statistics.corr(self.a, self.b, 'spearman')
Example #34
withstations = tidy.rdd.map(lambda row: Row(station=map_yx_to_station(row.yx),
                                            datehour=row.datehour)).toDF()

withstations.registerTempTable('stationincidents')
incidentcount = sqlc.sql(
    "select station, datehour, count(1) as incidents from stationincidents group by station, datehour"
)

print("we now have incidents by station/hour in incidentcount")
incidentcount.show(10)

# now join the two tables
joined = cleanedaverages.join(incidentcount, ['station', 'datehour'], 'outer')

# if incident data doesn't exist for that station/datehour, then it is 0
zeroed = joined.rdd.map(lambda row: Row(station=row.station,
                                        datehour=row.datehour,
                                        temp=row.temp,
                                        wind=row.wind,
                                        incidents=row.incidents
                                        if row.incidents else 0)).toDF()

# if temp/wind data doesn't exist for that station/datehour, then we can't use that row
final = zeroed.filter(zeroed.temp.isNotNull()).filter(
    zeroed.wind.isNotNull()).filter(zeroed.temp != 0)

# finally apply correlation test
vecs = final.rdd.map(
    lambda row: Vectors.dense([row.temp, row.wind, row.incidents]))
print(Statistics.corr(vecs))
from pyspark.mllib.stat import Statistics
summary = Statistics.colStats(rideRDD)
print "Duration\tMorning\tAfternoon\tEvening\tWeekday\tMale\tAge\n"
print("%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\n") % tuple(summary.mean())
print("%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\n") % tuple(summary.variance())
print("%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\n") % tuple(summary.numNonzeros())


# #### 1(c) Determine correlation of Age with Duration

# In[3]:

durationRDD = rideRDD.map(lambda x : x[0]) # Extract duration from the RDD
ageRDD = rideRDD.map(lambda x : x[6]) # Extract Age from the RDD
print(Statistics.corr(durationRDD, ageRDD, method="pearson")) # Print the Pearson correlation of Age vs. Duration


# ### ** Part 2: Linear Regression **

# #### ** (2a) Plotting **
# 
# 

# In[4]:

# Plot Age Vs. Duration
get_ipython().magic(u'matplotlib inline')
import matplotlib.pyplot as plt
plt.scatter(ageRDD.collect(), durationRDD.collect(),alpha=0.5)
plt.xlabel="Age"
Example #36
[[37.7816834,-122.3887657],\
[37.7469112,-122.4821759],\
[37.7411022,-120.804151],\
[37.4834543,-122.3187302],\
[37.7576436,-122.3916382],\
[37.7970013,-122.4140409],\
[37.748496,-122.4567461],\
[37.7288155,-122.4210133],\
[37.5839487,-121.9499339],\
[37.7157156,-122.4145311],\
[37.7329613,-122.5051491],\
[37.7575891,-122.3923824],\
[37.7521169,-122.4497687]])),
["SF18", "SF04", "SF15", "SF17", "SF36", "SF37",\
"SF07", "SF11", "SF12", "SF14", "SF16", "SF19", "SF34"] ),d,h))

located = located.map(lambda (l,d,h): ((d,h,l),1))

located = located.reduceByKey(lambda a, b : a + b)

joined = located.join(reducedTuple)

print joined.first()

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

vecs = joined.map(lambda ((d,h,s),(c,(w,t))): Vectors.dense([t,w,c]))

print(Statistics.corr(vecs))
Example #37
    return year_beat_crimes

years_crimes = filtered_more.flatMap(lambda x: (collapse_years(x)))
#print (years_crimes.collect())
# [(2001, 1584), (2002, 1327), (2003, 1272), (2004, 1164), (2005, 1089), 
# (2006, 1090), (2007, 1313), (2008, 1167), (2009, 1207), (2010, 1132), 
# (2011, 1028), (2012, 1430), (2013, 1625), (2014, 1616), (2015, 526), 
# (2001, 1720), (2002, 1679), (2003, 1691), (2004, 1412), (2005, 1172), 
# (2006, 1169), (2007, 1260), (2008, 1541), (2009, 1583), (2010, 1432), 
# (2011, 1327), (2012, 1124), (2013, 942), (2014, 891), (2015, 339), 

years_crimes_grouped = years_crimes.groupByKey()
for_vectors = years_crimes_grouped.mapValues(lambda x: sorted(list(x), key=lambda y: y))

crime_vectors = for_vectors.map(lambda x: Vectors.dense([x[1]])) 
beat_correlations = Statistics.corr(crime_vectors,method="pearson")
# print (np.shape(beat_correlations))
# produces a 254x254 array of correlations

def get_max_corr(x):
    max_corr = 0
    for (i,j), values in np.ndenumerate(x):
        if x[i][j] > max_corr and x[i][j] < 1 and abs(i-j) == 1:
            max_corr = x[i][j]
    return max_corr

max_correlation = get_max_corr(beat_correlations)
max_indices = np.where(beat_correlations==max_correlation)

#print (max_correlation)
#print (max_indices)
Example #38
header = raw_data.first()
raw_data = raw_data.filter(lambda x: x != header)


def parse_interaction(line):
    #split lines based on the delimeter, and create a list
    line_split = line.split(",")
    #replace NA with zeros
    line_split = [w.replace('NA', '0') for w in line_split]
    #remove year, and other non-numeric data
    """
	0 = Year
	11 = ActualElapsedTime
	12 = CRSElapsedTime
	13 = AirTime
	16 = Distance 
	"""
    symbolic_indexes = [0, 8, 10, 11, 12, 13, 16, 17, 18, 22]
    clean_line_split = [
        item for i, item in enumerate(line_split) if i not in symbolic_indexes
    ]
    return np.array([float(x) for x in clean_line_split])


vector_data = raw_data.map(parse_interaction)

#start timer at this point
startTime = datetime.now()
print(Statistics.corr(vector_data, method="pearson"))
print('Time consumed = '), (datetime.now() - startTime)
Example #39
    merged_final = merged.reduceByKey(lambda x,y : int(x) + int(y))

    #sort by month-year
    # Map each year to all beats and their corresponding crime counts for that year, and sort the counts 
    # by beat
    groupedbeatCountsbymonthyear = merged_final.map( lambda row: ( row[ 0 ][ 1 ], ( row[ 0 ][ 0 ], row[ 1 ] ) ) ) \
                                   .groupByKey( ) \
                                   .mapValues( lambda val: sorted( list( val ), key = lambda t: t[ 0 ] ) );
    # Create a list of all beats
    groupbeats = [ elem[ 0 ] for elem in groupedbeatCountsbymonthyear.values( ).first( ) ];
    
    beatvectorCounts = groupedbeatCountsbymonthyear.values( ) \
                                .map( lambda row: Vectors.dense( [ elem[ 1 ] for elem in row ] ) );
    
    # Compute correlation between all beats for yearly crime counts
    corrMatrix = Statistics.corr( beatvectorCounts, method = 'pearson' );
    
     # Fill the diagonal of correlation matrix with 0's
    corrMatrix.flags[ 'WRITEABLE' ] = True;
    np.fill_diagonal( corrMatrix, 0.0 );

    # Get the 10 largest correlation values from the matrixr The correlation matrix is symmetric so
    # we take the largest 20 and step by 2. Finally, the index of the corresponding beat pairs for
    # top 10 correlation values is obtained.
    sortOrder = corrMatrix.argsort( axis = None );
    indices = np.unravel_index( sortOrder[ -20::2 ], corrMatrix.shape  );

    # The corresponding beats names are obtained for the top 10 correlated beat pairs
    topBeatPairs = [ ( groupbeats[ i ], groupbeats[ j ] ) for i, j in zip( indices[ 0 ], indices[ 1 ] ) ];

    for i, j in topBeatPairs:
df_care_max_scores = df_care.groupBy('measure_id').max().collect()
df_care_max_scores = sc.broadcast(df_care_max_scores)

# function to extract max_score for each measure_id
def get_max_score(id):
    return [score[1] for score in df_care_max_scores.value if score[0] == id][0]

# creating a new RDD containing extra column for normalized score
# that is ratio of current score with maximum score for the measure_id
df_care = df_care.map(lambda p: Row(provider_id=p[0],
                                    measure_id=p[1],
                                    score=p[2],
                                    normalized_score=float(p[2]) / get_max_score(p[1])))

# creating dataframe from the RDD
df_care = sqlCtx.createDataFrame(df_care)

# get total normalized score per hospital
df_care = df_care.groupBy('provider_id').sum('normalized_score').map(lambda p: Row( provider_id = p[0], care_score = p[1]))
df_care = sqlCtx.createDataFrame(df_care)

# merge df_care and df_survey table
df = df_care.join(df_survey, df_care.provider_id == df_survey.provider_id).select(df_survey.provider_id,df_survey.survey_score, df_care.care_score)
survey_care_rdd = df.map(lambda p : (p.survey_score, p.care_score))
correlation = Statistics.corr(survey_care_rdd)
print 'correlation between hospital quality score and survey responses : %.3f'% (correlation[0][1])

Example #41
from datetime import datetime

sc = SparkContext(appName= "Run 1 - Corr-Wide - Data95-08 - AWS")

data_file = "s3://aws-logs-012060642840-us-west-2/elasticmapreduce/cloud_proj/95-08.csv"
raw_data = sc.textFile (data_file).cache ()
#extract header
header = raw_data.first () 
raw_data = raw_data.filter (lambda x:x != header)

def parse_interaction(line):
	#split lines based on the delimeter, and create a list
	line_split = line.split (",")
	#replace NA with zeros
	line_split = [w.replace ('NA', '0') for w in line_split]
	#remove year, and other non-numeric data
	"""
	0 = Year
	"""
	symbolic_indexes = [0, 8, 10, 16, 17, 22]
	clean_line_split = [item for i,item in enumerate (line_split) if i not in symbolic_indexes]
	return np.array ([float (x) for x in clean_line_split])

vector_data = raw_data.map (parse_interaction)

#start timer at this point
startTime = datetime.now()
print (Statistics.corr (vector_data, method="pearson"))
print ('Time consumed = '), (datetime.now() - startTime)
sc.stop()
beatYrComboCount = file1.map(lambda x: ( (x[10], int(x[17])) , 1)).reduceByKey(lambda x,y: x + y).cache()

# List of beat-yr combos
beatYr = beatYrComboCount.keys().flatMap(lambda x: [(x[0], y) for y in range(2001,2016)])

# Find missing beats and union with non-missing to create full set
missing = beatYr.subtract(beatYrComboCount.keys()).distinct()
allCrimeCnts = beatYrComboCount.union(missing.map(lambda x: (x,0)))

file_cnts = allCrimeCnts.map(lambda x: (x[0][1], (x[0][0], x[1]))).groupByKey().mapValues(lambda x: sorted(list(x),key = lambda x: x[0])).cache()

# List of beats
beats = [element[0] for element in file_cnts.values().first()]
vectorCnts = file_cnts.values().map(lambda x: Vectors.dense([ element[1] for element in x]))

cor = Statistics.corr(vectorCnts, method='pearson')


cor.flags['WRITEABLE'] = True;
np.fill_diagonal(cor, 0.0)

# Get top 10 correlation values from matrix
sortOrder = cor.argsort(axis=None)
ind = np.unravel_index(sortOrder[-20::2], cor.shape)

mostCorrBeatPairs = [(beats[i], beats[j]) for i,j in zip(ind[0], ind[1])]

for i,j in mostCorrBeatPairs:
	print i,j

Example #43
cord = orders.select("order_id", "user_id")

ordpr = order_products.select("order_id", "product_id")
f = ordpr.join(cord, "order_id")
more = cord.groupby("user_id").count()
larger = f.groupby("user_id").count()

more = more.sort(asc("user_id"))
more.show()
mo = more.select("count")
mo = mo.collect()

larger = larger.sort(asc("user_id"))
la = larger.select("count")
la = la.collect()

#A = more.join(larger, more.user_id == larger.user_id)
#a =more.select("count")
#a = more.collect()
#a = np.array(a)
#b = larger.select("count")
#b = more.collect()
#b = np.array(b)
#

from pyspark.mllib.stat import Statistics

# mo and la are lists of Row(count=...); corr needs RDDs of plain numbers
# and returns a float, so there is no .head() to call on the result
r1 = Statistics.corr(sc.parallelize([row["count"] for row in mo]),
                     sc.parallelize([row["count"] for row in la]),
                     method="pearson")
print("Correlation\n" + str(r1))
__author__ = 'Michael'


from pyspark.mllib.stat import Statistics

api_mentions = sc.parallelize([ 23, 35, 56])
mpetyx_tweets = sc.parallelize([ 45, 67, 76])

corr = Statistics.corr(api_mentions, mpetyx_tweets, "pearson")
Example #45
def readRankMatrix():
    import numpy as np
    lines = sc.textFile('../yelp_trans.csv')
    rawData = lines.mapPartitionsWithIndex(removeHeader)
    mydata = rawData.map(removeColumns).cache()
    return mydata


from pyspark.mllib.stat import Statistics
from pandas import Series
import pandas as pd
import numpy as np
import math

mydata = readRankMatrix()
corr = Statistics.corr(mydata)

# set up the columns names and add a new names called user_id
lines2 = sc.textFile('../yelp.csv')
names = lines2.map(lambda line:line.split(",")).map(lambda a:a[0]).collect()[1:]

s = Series([str for str in names])
pddata = pd.DataFrame(corr, columns=s)
pddata['user_id'] = names
df_corr = sqlContext.createDataFrame(pddata)
# df_corr.cache()
df_corr.registerTempTable("corr")

def getTopReviewUsers(n):
    # n: the nth highest user
    ord_user = sqlContext.sql("select user_id, count(review_id) as count from reviews_json group by user_id order by count desc")
Example #46
if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print("Usage: correlations (<file>)", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="PythonCorrelations")
    if len(sys.argv) == 2:
        filepath = sys.argv[1]
    else:
        filepath = 'data/mllib/sample_linear_regression_data.txt'
    corrType = 'pearson'

    points = MLUtils.loadLibSVMFile(sc, filepath)\
        .map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))

    print()
    print('Summary of data file: ' + filepath)
    print('%d data points' % points.count())

    # Statistics (correlations)
    print()
    print('Correlation (%s) between label and each feature' % corrType)
    print('Feature\tCorrelation')
    numFeatures = points.take(1)[0].features.size
    labelRDD = points.map(lambda lp: lp.label)
    for i in range(numFeatures):
        featureRDD = points.map(lambda lp: lp.features[i])
        corr = Statistics.corr(labelRDD, featureRDD, corrType)
        print('%d\t%g' % (i, corr))
    print()

    sc.stop()
Example #47
        out.append(int(l[i]) - int(l[i - 1]))
    return out


if __name__ == "__main__":
    cov = sys.argv[1]
    senti = sys.argv[2]
    #output = sys.argv[2]

    sc = SparkContext("local", "death_trend")

    covid = sc.textFile(cov).map(lambda line: parseLine(line).split(
        ",")).filter(lambda line: line[-1].isdigit()).map(
            lambda line: ["global", dailyIncrease(line[60::])]).reduceByKey(
                combineSeries).collect()
    covid = sc.parallelize(covid[0][1])
    maximum = covid.max()
    covid = covid.map(lambda each: float(each) / float(maximum))
    covid.saveAsTextFile("covid")

    sent = sc.textFile(senti).map(lambda line: line.split(" ")).map(
        lambda line: [int(line[0][14:-4]), float(line[3])]).collect(
        )  #.saveAsTextFile("senti_processed")
    sorted_senti = sorted(sent)
    sorted_senti = sc.parallelize(sorted_senti).map(lambda x: x[1])
    max_senti = sorted_senti.max()
    sorted_senti = sorted_senti.map(lambda x: float(x) / float(max_senti))
    sorted_senti.saveAsTextFile("sorted_senti")
    #print(sorted_senti)
    print(Statistics.corr(covid, sorted_senti))
Example #48
def dist_corr(v1, v2):
    """
    Function to compute correlation between two Spark RDDs
    """

    return Statistics.corr(v1,v2)
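# Small usage sketch (assumes an existing SparkContext `sc`; the values are
# made up for illustration): with two RDDs of doubles, Statistics.corr
# returns a single Pearson coefficient.
v1 = sc.parallelize([1.0, 2.0, 3.0, 4.0])
v2 = sc.parallelize([2.0, 4.1, 6.2, 7.9])
print(dist_corr(v1, v2))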
Example #49
if __name__ == "__main__":

    #Simple Correlation
    sc = pyspark.SparkContext()
    csv = np.genfromtxt('C:\\SBU\\Fall2017\\BigData\\Project\\BDProject\\final_test_selfHarm.csv', delimiter=",", skip_header=3)

    second = csv[:, 10].astype(float)
    print(second[:3])
    third = csv[:, 17].astype(float)
    print(third[:3])

    a = []
    b = []

    for i in range(len(second)):
        if math.isnan(second[i]) or math.isnan(third[i]):
            continue
        else:
            a.append(second[i])
            b.append(third[i])

    a = np.asarray(a)
    b = np.asarray(b)
    seriesX = sc.parallelize(a)
    seriesY = sc.parallelize(b)
    corrval = Statistics.corr(seriesX, seriesY, method="pearson")

    print(corrval)
Example #50
"""

Testing with Correlation
https://spark.apache.org/docs/latest/mllib-statistics.html

"""

from pyspark.mllib.stat import Statistics
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import SparseVector, Vectors


sc = SparkContext("local", "Rubbish")

seriesX = sc.parallelize([1.0, 2.0, -2.0], 2)
seriesY = sc.parallelize([3.0, 4.0, 5.0], 2)
corrXY =  Statistics.corr(seriesX, seriesY, method="pearson")

# RDD of Vectors
data = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
                       Vectors.dense([4, 5, 0,  3]),
                       Vectors.dense([6, 7, 0,  8])])

print "Correlation between x & y: ", corrXY
print "Correlation matrix: ", data
bundle_pearson_dict = {} #dictionary to hold the bundle as key and the coeff as value

for bundle_name in actual_bundle_list:
    final_table_by_bundle = sqlContext.sql("select * from final_table_sorted where bundle = \""+bundle_name+"\"")
    food_metric_only= final_table_by_bundle.map(lambda p:  p.zip_AGI_foodmetric[2])
    food_metric_list = food_metric_only.collect()
    weighted_AGI_only= final_table_by_bundle.map(lambda p:  p.zip_AGI_foodmetric[1])
    weighted_AGI_list = weighted_AGI_only.collect()
    if not food_metric_list and not weighted_AGI_list:
        print 'pass'
    else:
        
        x=sc.parallelize(weighted_AGI_list,2)
        y=sc.parallelize(food_metric_list,2)
        
        correlation_coeff =  Statistics.corr(x,y, method="pearson") # -0.128161962745 or is it -0.0965926041863??
        bundle_pearson_dict[bundle_name]= correlation_coeff
    
        
bundle_pearson_dict  #to get all coeff values by bundle

# In[53]:

#Here I have an example scatter plot for bundle_name = 'vegetables' to have an idea of how the plot looks
# x is the AGI for every zip code
# y is the food metric
#an example plot is also available to be viewed in the parent folder

final_table_by_bundle = sqlContext.sql("select * from final_table_sorted where bundle = 'vegetables'")
food_metric_only= final_table_by_bundle.map(lambda p:  p.zip_AGI_foodmetric[2])
food_metric_list = food_metric_only.collect()
Example #52
#keep only Cyl, Displacement
autoVectors = dataLines.map(transformToNumeric)
autoVectors.collect()

#perform analysis
from pyspark.mllib.stat import Statistics
from pyspark.mllib.linalg import Vectors
from pyspark.sql import SQLContext

autoStats = Statistics.colStats(autoVectors)
autoStats.mean()
autoStats.variance()
autoStats.min()
autoStats.max()
Statistics.corr(autoVectors)

sqlContext = SQLContext(sc)

def transformToLabeledPoint(inStr):
    lp = (float(inStr[0]), Vectors.dense(inStr[1]))
    return lp

autoLp = autoVectors.map(transformToLabeledPoint)
autoDf = sqlContext.createDataFrame(autoLp, ["label", "features"])
autoDf.select("label", "features").show(10)

#SPLIT
Example #53
# $example on$
from pyspark.mllib.stat import Statistics

# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="CorrelationsExample")  # SparkContext

    # $example on$
    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
    # seriesY must have the same number of partitions and cardinality as seriesX
    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print("Correlation is: " +
          str(Statistics.corr(seriesX, seriesY, method="pearson")))

    data = sc.parallelize([
        np.array([1.0, 10.0, 100.0]),
        np.array([2.0, 20.0, 200.0]),
        np.array([5.0, 33.0, 366.0])
    ])  # an RDD of Vectors

    # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print(Statistics.corr(data, method="pearson"))
    # $example off$

    sc.stop()
Example #54
dataLines = rddUSD.filter(lambda x: x != header)
dataLines.count()
dataLines.first()
dataLines.take(5)

#RDD to Dense vector
vectorsUSD = dataLines.map(transformationDT.transformToNumeric)
vectorsUSD.take(5)

#Perform statistical Analysis
statsUSD=Statistics.colStats(vectorsUSD)
statsUSD.mean()
statsUSD.variance()
statsUSD.min()
statsUSD.max()
Statistics.corr(vectorsUSD)

#SPARK SQL
dataframe = pycsv.csvToDataFrame(sqlContext, rddUSD, sep=",")
dataframe.registerTempTable("dataUSDuprv")
dff1=sqlContext.sql("SELECT closeJPY FROM dataUSDuprv").show()
dataframe.show()


#LabeledPoint
lpUSD = vectorsUSD.map(transformationDT.transformToLabeledPoint)
lpUSD.take(5)
dfUSD = sqlContext.createDataFrame(lpUSD, ["label", "features"])
dfUSD.select("label", "features").show(10)

#String Indexer