Example #1
def main():

    sc = initializeSpark()

    spark = SparkSession(sc)

    directory, post_id = parse_pls()
    rdds = make_rdds_from_dir(directory, sc)
    post_rdd = rdds["posts_rdd"]

    string = make_stripped_string(post_rdd, post_id)

    print("\n Body from post_id: " + str(post_id) +
          ", stripped of shitespaces and special characters:\n")
    print("'" + string + "'\n")

    # Tokenize the string
    tokens = tokenize(string)
    # remove duplicate entries
    tokens_unique = remove_dupes(tokens)

    # Assign id to the unique tokens
    token_id_tuple = assign_id_to_list(tokens_unique)
    # Now assign these id's to the original token list
    token_id_all = assign_unique_ids(token_id_tuple, tokens)

    print("\nTokens retrieved from the body with their respective id's: \n")
    for i in token_id_all:
        print(i)

    print("\n\nEdges:\n")
    ids = []
    for i in token_id_all:
        ids.append(i[0])

    # Create edges on a window size of 5, using the ids of the tokens
    edges = create_edges(ids, 5)
    # Removes duplicate edges from list
    edges = remove_dupe_tuples(edges)
    print(edges)
    print("\n\nPageRank:")

    sqlContext = SQLContext(sc)

    v = sqlContext.createDataFrame(token_id_tuple, ["id", "word"])

    e = sqlContext.createDataFrame(edges, ["src", "dst"])

    g = graphframes.GraphFrame(v, e)

    results = g.pageRank(resetProbability=0.15, tol=0.0001)
    results.vertices.select("word", "pagerank").show(truncate=False)
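The helpers used in main() (tokenize, remove_dupes, assign_id_to_list, assign_unique_ids, create_edges, remove_dupe_tuples) are not included in this snippet. A minimal sketch of what they could look like, assuming edges are undirected pairs of token ids that co-occur within the sliding window:

import re


def tokenize(string):
    # Lower-case and split on runs of non-alphanumeric characters.
    return [t for t in re.split(r"\W+", string.lower()) if t]


def remove_dupes(tokens):
    # Keep the first occurrence of each token, preserving order.
    seen = set()
    return [t for t in tokens if not (t in seen or seen.add(t))]


def assign_id_to_list(tokens_unique):
    # (id, token) tuples, ids starting at 0.
    return [(i, t) for i, t in enumerate(tokens_unique)]


def assign_unique_ids(token_id_tuple, tokens):
    # Map every token of the original list back to its unique id.
    lookup = {t: i for i, t in token_id_tuple}
    return [(lookup[t], t) for t in tokens]


def create_edges(ids, window_size):
    # Pair each id with the ids that follow it inside the sliding window.
    edges = []
    for pos, src in enumerate(ids):
        for dst in ids[pos + 1:pos + window_size]:
            if src != dst:
                edges.append((src, dst))
    return edges


def remove_dupe_tuples(edges):
    # Treat (a, b) and (b, a) as the same undirected edge.
    seen = set()
    unique = []
    for a, b in edges:
        key = (min(a, b), max(a, b))
        if key not in seen:
            seen.add(key)
            unique.append(key)
    return unique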
Example #2
def Compute_Average(rinput, output):

    ## ratings data
    ### Creating ratings-rdd
    my_RDD_strings = sc.textFile(rinput + '/' + 'ratings.csv')
    data = my_RDD_strings.map(lambda line: readline(line))

    ## Extracting header row

    header_info = data.first()

    ### Selects all the rows except the header row
    data_mr = data.filter(lambda ratings: ratings != header_info)

    data_mr = data_mr.map(lambda ratings: string_to_float(ratings))
    data_mr_sum_count = data_mr.aggregateByKey(
        (0, 0), lambda U, s: (U[0] + s, U[1] + 1), lambda U, V:
        (U[0] + V[0], U[1] + V[1]))
    ### format: (movieID, (sum, count))
    avg_ratings = data_mr_sum_count.map(
        lambda kv: (kv[0], float(kv[1][0]) / kv[1][1]))
    ### here kv[0] = movieID, kv[1] = (sum, count); tuple-unpacking lambdas are Python 2 only

    sorted_avg_ratings = avg_ratings.sortByKey(
    )  ## sorting by movieID in ascending order

    ## Creating output csv file based on the dataset-folder

    if 'ml-20m' in rinput:
        result_csv = 'Prashanth_Manja_result_task1_big.csv'
    else:
        result_csv = 'Prashanth_Manja_result_task1_small.csv'

    sqlContext = SQLContext(sc)
    data_frame = sqlContext.createDataFrame(sorted_avg_ratings)
    panda_data_frame = data_frame.toPandas()

    ## Output as csv file
    panda_data_frame.to_csv(output + '/' + result_csv,
                            encoding='utf-8',
                            header=['movieID', 'rating_avg'],
                            index=False)
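The (sum, count) accumulator passed to aggregateByKey above is the standard way to compute per-key averages without groupByKey. A standalone illustration on toy (movieID, rating) pairs (its own local SparkContext, not the ratings.csv data):

from pyspark import SparkConf, SparkContext

demo_sc = SparkContext(conf=SparkConf().setMaster("local[*]").setAppName("avg_demo"))
ratings = demo_sc.parallelize([(1, 4.0), (1, 5.0), (2, 3.0), (2, 4.0), (2, 5.0)])

# seqOp folds one rating into the (sum, count) accumulator,
# combOp merges two partial accumulators from different partitions.
sum_count = ratings.aggregateByKey(
    (0.0, 0),
    lambda acc, r: (acc[0] + r, acc[1] + 1),
    lambda a, b: (a[0] + b[0], a[1] + b[1]))

averages = sum_count.mapValues(lambda sc_pair: sc_pair[0] / sc_pair[1])
print(sorted(averages.collect()))  # [(1, 4.5), (2, 4.0)]
demo_sc.stop()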
Example #3
class MainApp(object):
    def __init__(self):
        pass
    
    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local[10]")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)        

    def loadData(self):
        # long() is Python 2 only; int() handles the epoch timestamp in Python 3
        category_list = self.sc.textFile(
            "/Users/abhinavrungta/Desktop/uf-study/snc/github/SNC-WEB/src/yahoo/ydata-ymovies-user-movie-ratings-train-v1_0.txt"
        ).map(lambda line: (int(line.split(',')[0]), int(line.split(',')[1]),
                            float(line.split(',')[2]), int(line.split(',')[3])))
        category_schema = StructType([
            StructField("userid", IntegerType(), True),
            StructField("movieid", IntegerType(), True),
            StructField("rating", FloatType(), True),
            StructField("time", LongType(), True)
        ])
        category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list.registerTempTable("data")
        movie_list = self.sqlContext.sql("SELECT movieid, COUNT(movieid) AS ct FROM data GROUP BY movieid")
        movie_list.registerTempTable("movie")
        movieid = movie_list.sort(movie_list.ct.desc()).first().movieid
        # movieid = category_list.first().movieid
        category_list = self.sqlContext.sql("SELECT * FROM data WHERE movieid = {0}".format(movieid))
        category_list.registerTempTable("data")
        user_list = self.sqlContext.sql("SELECT DISTINCT userid FROM data LIMIT 50")
        print(user_list.count())
        user_list.show()
        user_list.registerTempTable("users")
        category_list = self.sqlContext.sql("SELECT d.userid AS userid, d.movieid AS movieid, d.rating AS rating, d.time AS time FROM data d, users u WHERE d.userid = u.userid").repartition(1)
        #category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list = category_list.map(lambda line: str(line.userid) + "," + str(line.movieid) + "," + str(line.rating) + "," + str(line.time))
        category_list = category_list.repartition(1)
        category_list.saveAsTextFile("data.txt")
Example #4
sprk = Spark_Session()
conn = sprk.Spark_Context()
sql_conn = sprk.Spark_Connect()
sqlContext = SQLContext(conn)

dept_rdd = conn.textFile("/Users/shuvamoymondal/Downloads/Dept.txt")
emp_rdd = conn.textFile("/Users/shuvamoymondal/Downloads/Emp.txt")

emp = emp_rdd.map(lambda x: x.split(",")).map(lambda k: Row(empID=int(k[0]),
                                                            name=k[1].strip(),
                                                            Age=k[3].strip(),
                                                            Sal=float(k[4]),
                                                            DeptID=int(k[5])))
dept = dept_rdd.map(lambda x: x.split(",")).map(
    lambda k: Row(DeptID=int(k[0]), State=k[2].strip()))

EmpDF = sqlContext.createDataFrame(emp)
DeptDF = sqlContext.createDataFrame(dept)
EmpDF.show()
DeptDF.show()

#emp1.join(dept1,emp1.Deptid == dept1.deptid,"inner").select(emp1.age,dept1.deptid,emp1.Sal).show(2)

EmpDF.withColumn('DEPARTMENTNO', EmpDF.DeptID).withColumn('SALARY',
                                                          EmpDF.Sal).show()

EmpDF.select(
    EmpDF.Age,
    to_date(from_unixtime(unix_timestamp(
        EmpDF.Age, 'dd-MM-yyyy'))).alias("ModAge")).show()
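The Age column in this file evidently holds 'dd-MM-yyyy' strings rather than numeric ages, and the unix_timestamp -> from_unixtime -> to_date chain converts them to a proper date column. The same chain on a toy DataFrame (hypothetical data, standalone sketch):

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import to_date, from_unixtime, unix_timestamp

demo_sc = SparkContext(conf=SparkConf().setMaster("local[*]").setAppName("date_demo"))
demo_sql = SQLContext(demo_sc)

demo_df = demo_sql.createDataFrame([Row(raw="01-02-1990"), Row(raw="15-07-1985")])

# unix_timestamp parses the string with the given pattern into epoch seconds,
# from_unixtime renders that as a timestamp string, to_date truncates it to a date.
demo_df.select(
    demo_df.raw,
    to_date(from_unixtime(unix_timestamp(demo_df.raw, 'dd-MM-yyyy'))).alias("ModAge")
).show()
demo_sc.stop()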
Example #5
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging

from pyspark import SparkContext, SparkConf
from pyspark.sql.context import SQLContext
from pyspark.sql.types import Row

__author__ = 'guotengfei'
__time__ = '2019/11/26'
"""
Module comment
"""

LOGGER = logging.getLogger(__name__)

if __name__ == '__main__':
    conf = SparkConf().setAppName('vcom')
    sc = SparkContext(conf=conf)
    sql = SQLContext(sc)

    lines = sc.textFile('../data/users.txt')
    user = lines.map(lambda l: l.split(",")).map(lambda p: (p[0], p[1]))

    sql.createDataFrame(user, ['id', 'name']).registerTempTable('user')
    df = sql.sql("select id,name from user")

    df.write.save("../data/result", format='json')
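    # Not in the original: a quick usage sketch that reads the JSON result back
    # from the same path and inspects it.
    df_check = sql.read.json("../data/result")
    df_check.show()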
Example #6
KnownRow = Row('node', 'source', 'distance')

schema = StructType([
    StructField('node', StringType(), False),
    StructField('source', StringType(), False),
    StructField('distance', IntegerType(), False),
])

graphedges_rdd = textinput.map(lambda line: get_graphedges(line)).filter(
    lambda x: x is not None).flatMap(lambda x: x).coalesce(1)
graphedges = graphedges_rdd.toDF(['source', 'destination']).cache()
graphedges.registerTempTable('SourceDestTable')

initial_node = source_node
initial_row = KnownRow(initial_node, initial_node, 0)
knownpaths = sqlContext.createDataFrame([initial_row], schema=schema)
part_knownpaths = knownpaths

for i in range(6):
    part_knownpaths.registerTempTable('PartKnownPathTable')

    newpaths = sqlContext.sql("""
    SELECT destination AS node, t1.source AS source, (distance+1) AS distance FROM
    SourceDestTable t1
    JOIN
    PartKnownPathTable t2
    ON (t1.source = t2.node)
    """).show()

    newpaths.registerTempTable('NewPathTable')
    knownpaths.registerTempTable('KnowPathTable')
Example #7
import pyspark.sql.functions as fn
import pyspark.sql.types as typ

if __name__ == "__main__":

    # initialize the spark
    conf = SparkConf().setMaster("local[*]").setAppName("Test4_1")

    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)

    # create DataFrame duplicates
    df = sqlContext.createDataFrame(
        [(1, 144.5, 5.9, 33, 'M'), (2, 167.2, 5.4, 45, 'M'),
         (3, 124.1, 5.2, 23, 'F'), (4, 144.5, 5.9, 33, 'M'),
         (5, 124.1, 5.2, 54, 'F'), (3, 124.1, 5.2, 23, 'F'),
         (5, 129.2, 5.3, 42, 'M')],
        ['id', 'weight', 'height', 'age', 'gender'])

    print('Count of rows: {0}'.format(df.count()))
    print('Count of distinct rows: {0}'.format(df.distinct().count()))

    # duplicated
    df_drop = df.dropDuplicates()

    print('Count of ids: {0}'.format(df_drop.count()))
    print('Count of distinct ids: {0}'.format(
        df.select([c for c in df_drop.columns
                   if c != 'id']).distinct().count()))

    df_drop2 = df_drop.dropDuplicates(
        subset=[c for c in df_drop.columns if c != 'id'])
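    # A hedged sketch (not in the original fragment) of how the duplicate-id
    # check is often finished with the fn (pyspark.sql.functions) import above:
    # compare count vs. countDistinct over the 'id' column.
    df.agg(
        fn.count('id').alias('count'),
        fn.countDistinct('id').alias('distinct')).show()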
Example #8
conf = SparkConf().setAppName("load logs")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
assert sc.version >= '1.5.1'

text = sc.textFile(inputs1)+sc.textFile(inputs2)

def parseline(line):
    # Apache access-log pattern: host, timestamp, request path, response size
    linere = re.compile('^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$')
    match = linere.search(line)
    if match:
        host = match.group(1)
        dt = datetime.datetime.strptime(match.group(2), '%d/%b/%Y:%H:%M:%S')
        path = match.group(3)
        bys = float(match.group(4))
        return {"host": host, "datetime": dt, "path": path, "bys": bys}
    return None

rdd = text.map(lambda line: parseline(line)).filter(lambda x: x is not None)
df = sqlContext.createDataFrame(rdd).coalesce(1)
df.write.format('parquet').save(output, mode='overwrite')
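A quick way to sanity-check parseline is to feed it a single Apache-style access-log line (the sample host and path below are made up):

sample = '127.0.0.1 - - [01/Aug/1995:00:00:01 -0400] "GET /images/launch-logo.gif HTTP/1.0" 200 1839'
print(parseline(sample))
# -> {'host': '127.0.0.1', 'datetime': datetime.datetime(1995, 8, 1, 0, 0, 1),
#     'path': '/images/launch-logo.gif', 'bys': 1839.0}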
Example #9
def f(x):
    rel = {}
    rel['name'] = x[0]
    rel['age'] = x[1]
    return rel


peopleRDD = sc.textFile(
    "file:///usr/local/my_soft/spark-2.1.0/examples/src/main/resources/people.txt"
)

# Define the schema string
schemaString = "name age"

# Build the schema from the schema string
fields = list(
    map(lambda fieldName: StructField(fieldName, StringType(), nullable=True),
        schemaString.split(" ")))
schema = StructType(fields)
# As shown above, schema describes the structure: it contains the two fields name and age

rowRDD = peopleRDD.map(lambda line: line.split(',')).map(
    lambda attributes: Row(attributes[0], attributes[1]))

peopleDF = spark.createDataFrame(rowRDD, schema)

# Must be registered as a temporary table before the query below can use it
peopleDF.createOrReplaceTempView("people")

results = spark.sql("SELECT * FROM people")
results.rdd.map(lambda attributes: "name: " + attributes[0] + "," + "age:" +
                attributes[1]).foreach(print)
Example #10
df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other. 
# A positive number would mean that there is a tendency that as one variable increases, 
# the other increases as well. 
# A negative number would mean that as one variable increases, 
# the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')

# Correlation is a normalized measure of covariance that is easier to understand, 
# as it provides quantitative measurements of the statistical dependence between two random variables.
df.stat.corr('rand1', 'rand2')
df.stat.corr('id', 'id')

# 4. Cross-tabulation is a powerful tool in statistics that is used to 
# observe the statistical significance (or independence) of variables.

# Create a DataFrame with two columns (name, item)
names = ["Alice", "Bob", "Mike"]
items = ["milk", "bread", "butter", "apples", "oranges"]
df = sqlcontext.createDataFrame([(names[i % 3], items[i % 5]) for i in range(100)], ["name", "item"])

df.show()
df.stat.crosstab("name", "item").show()



Example #11
output_schema = StructType(schema.fields + [StructField(prediction_field, prediction_type, nullable=True)])

if ascontext:
    if ascontext.isComputeDataModelOnly():
        ascontext.setSparkOutputSchema(output_schema)
        sys.exit(0)

if ascontext:
    s_model = ascontext.getModelContentToString("model")
    s_metadata = ascontext.getModelContentToString("model.metadata")
else:
    s_model = open(modelpath,"r").read()
    s_metadata = open(modelmetadata_path,"r").read()

df = df.toPandas()
model_metadata = json.loads(s_metadata)
import pickle
clf = pickle.loads(s_model)

predictors = model_metadata['predictors']
missing_predictors = [predictor for predictor in predictors if predictor not in df.columns]
if len(missing_predictors):
    raise Exception("Following fields are required by the model for scoring: "+str(missing_predictors))

df[prediction_field] = clf.predict(df[predictors])

df = sqlCtx.createDataFrame(df)

if ascontext:
    ascontext.setSparkOutputData(df)
Example #12
textinput = sc.textFile(inputs + 'links-simple-sorted.txt')
graphedges_rdd = textinput.map(lambda line: get_graphedges(line)).\
      filter(lambda x: x is not None).flatMap(lambda x: x)
graphedges = graphedges_rdd.toDF(['source', 'destination']).cache()
print('constructed edge graph df from given input')
graphedges.show()
KnownRow = Row('node', 'source', 'distance')

schema = StructType([
    StructField('node', StringType(), False),
    StructField('source', StringType(), False),
    StructField('distance', IntegerType(), False),
])

newRow = KnownRow(source_node, source_node, 0)
finalOut = sqlContext.createDataFrame([newRow], schema=schema).cache()

inter_df = finalOut
# inter_df.show()
for i in range(6):

    print('loop : ', i)

    if (inter_df.filter(inter_df.node == dest_node).count() > 0):
        print('match found')
        break

    # graphedges.join(inter_df.source)
    cond = [inter_df['node'] == graphedges['source']]
    # the original snippet is cut off mid-select; the distance column below is
    # completed to match the SQL version in examples #6 and #21
    df_result = graphedges.join(inter_df, cond, 'inner').\
        select(graphedges['destination'].alias('node'), graphedges['source'],
               (inter_df['distance'] + 1).alias('distance'))
Example #13
from pyspark.sql.readwriter import DataFrameReader, DataFrameWriter
from pyspark.sql.window import Window, WindowSpec

conf = SparkConf().setMaster("local").setAppName("MyApp")
sc = SparkContext(conf=conf)
spark = SQLContext(sc)


def stopSpark():
    sc.stop()


def f(x):
    rel = {}
    rel['name'] = x[0]
    rel['age'] = x[1]
    return rel

people = sc.textFile("file:///usr/local/my_soft/spark-2.1.0/examples/src/main/resources/people.txt")\
    .map(lambda line : line.split(','))\
    .map(lambda p: Row(name=p[0], age=int(p[1])))
peopleDF = spark.createDataFrame(people)

# peopleDF = sc.textFile("file:///usr/local/my_soft/spark-2.1.0/examples/src/main/resources/people.txt")\
#     .map(lambda line : line.split(','))\
#     .map(lambda x: Row(**f(x))).toDF()

peopleDF.createOrReplaceTempView("people")  # must register as a temporary table before the query below can use it
personsDF = spark.sql("select name,age from people where age>20")
personsDF.rdd.map(lambda t: "Name:" + t[0] + "," + "Age:" + str(t[1])).foreach(
    print)
Example #14
    swimmersJSON.printSchema()

    # schema reflection
    stringCSVRDD = sc.parallelize([(123, 'Katie', 19, 'brown'),
                                   (234, 'Michael', 22, 'green'),
                                   (345, 'Simone', 23, 'blue')])

    schema = StructType([
        StructField("id", LongType(), True),
        StructField("name", StringType(), True),
        StructField("age", LongType(), True),
        StructField("eyeColor", StringType(), True)
    ])

    swimmers = sqlContext.createDataFrame(stringCSVRDD, schema)

    swimmers.registerTempTable("swimmers")

    swimmers.printSchema()

    # API
    swimmers.count()

    swimmers.select("id", "age").filter("age = 22").show()

    swimmers.select(swimmers.id, swimmers.age).filter(swimmers.age == 22).show()

    swimmers.select("name", "eyeColor").filter("eyeColor like 'b%'").show()

    sqlContext.sql("select count(1) from swimmers").show()
Example #15
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

if __name__ == '__main__':
    spark = SparkSession.builder.appName(
        "Python Spark SQL basic example").master("local[*]").config(
            "spark.some.config.option", "some-value").getOrCreate()

    sc = spark
    conn = sc.sparkContext
    sqlContext = SQLContext(conn)

    dept_rdd = conn.textFile("/Users/shuvamoymondal/Downloads/Dept.txt")
    emp_rdd = conn.textFile("/Users/shuvamoymondal/Downloads/Emp.txt")
    emp = emp_rdd.map(lambda x: x.split(",")).map(lambda k: k)
    dept = dept_rdd.map(lambda x: x.split(",")).map(lambda k: k)
    dt1 = emp_rdd.map(lambda j: j.split(",")).map(
        lambda k: Row(empID=k[0],
                      Name=k[1].strip(),
                      Design=k[2].strip(),
                      Age=k[3],
                      Sal=k[4],
                      DeptID=int(k[5])))

    dt2 = dept_rdd.map(lambda j: j.split(",")).map(
        lambda k: Row(DeptID=int(k[0]), Job=k[1].strip(), State=k[2].strip()))

    EmpDF = sqlContext.createDataFrame(dt1)
    DeptDF = sqlContext.createDataFrame(dt2)
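    # Not in the original snippet: a natural next step would be an inner join of
    # the two DataFrames on DeptID (mirrors the commented-out join in example #4).
    EmpDF.join(DeptDF, EmpDF.DeptID == DeptDF.DeptID, "inner") \
        .select(EmpDF.Name, EmpDF.Sal, DeptDF.State) \
        .show()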
Example #16
dmt = DataModelTools(datamodel)

predictors = model_metadata["predictors"]
DataModelTools.checkPredictors(datamodel, predictors, df)

from pyspark.mllib.clustering import KMeansModel
model = KMeansModel.load(sc, modelpath)

# to score the model, we need an RDD of DenseVector (the numeric encoded values of the predictors), use DataModelTools to do this
dv = dmt.extractDenseVector(df, predictors, setToFlag=1.0)


def rowToList(row):
    result = []
    for idx in range(0, len(row)):
        result.append(row[idx])
    return result


mapFn = lambda xy: rowToList(xy[0]) + [xy[1]]

rdd2 = dv.map(lambda x: rowToList(x[0]) + [model.predict(x[1])])

# finally convert the RDD containing the list of values for the resulting rows, back to a dataframe
outdf = sqlCtx.createDataFrame(rdd2, output_schema)

if ascontext:
    ascontext.setSparkOutputData(outdf)
else:
    print(outdf.take(100))
predictors = model_metadata["predictors"]
DataModelTools.checkPredictors(datamodel,predictors,df)

from pyspark.mllib.clustering import KMeansModel
model = KMeansModel.load(sc, modelpath)

# to score the model, we need an RDD of DenseVector (the numeric encoded values of the predictors), use DataModelTools to do this
dv = dmt.extractDenseVector(df,predictors,setToFlag=1.0)

def rowToList(row):
        result = []
        for idx in range(0, len(row)):
            result.append(row[idx])
        return result


mapFn = lambda (x,y):rowToList(x)+[y]

rdd2 = dv.map(lambda x: rowToList(x[0]) + [model.predict(x[1])])

# finally convert the RDD containing the list of values for the resulting rows, back to a dataframe
outdf = sqlCtx.createDataFrame(rdd2,output_schema)

if ascontext:
    ascontext.setSparkOutputData(outdf)
else:
    print(outdf.take(100))



Example #18
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ[
            "SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("My application")
        conf.set("spark.executor.memory", "2g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)
        self.df_user = self.sqlContext.read.json("dataset/user.json").cache()
        self.df_review = self.sqlContext.read.json(
            "dataset/review.json").cache()
        self.df_business = self.sqlContext.read.json(
            "dataset/business.json").cache()
        self.df_user.registerTempTable("user")

    def getS3File(self, s3FilePath, destinationPathOnLocal):
        r = requests.get(s3FilePath)
        fileOb = open(destinationPathOnLocal, 'w')
        fileOb.write(r.text)
        fileOb.close()

    def writeToS3File(self, s3FilePath, sourcePathOnLocal):
        fileOb = open(sourcePathOnLocal, 'r')
        payload = fileOb.read()
        fileOb.close()

        headers = {"x-amz-acl": "public-read-write"}
        return requests.put(s3FilePath, headers=headers, data=payload)

    def reads3spark(self, path):
        # path = "s3n://b-datasets/flight_data/*"
        x = self.sc.textFile(path)  # we can just specify all the files.
        return x

    def writes3spark(self, x, path):
        x.saveAsTextFile(path)

    def createFeatures(self):
        userData = self.sqlContext.sql(
            "SELECT user_id, name, review_count, votes, fans, yelping_since, elite FROM user"
        )
        userData = userData.map(mapUsers).coalesce(1)
        res = self.sqlContext.createDataFrame(userData)

        review_user = self.df_review.select(self.df_review.business_id,
                                            self.df_review.user_id)
        business_loc = self.df_business.select(self.df_business.business_id,
                                               self.df_business.city,
                                               self.df_business.state)
        df_join_reviewAndBusiness = review_user.join(
            business_loc,
            review_user.business_id == business_loc.business_id).select(
                "user_id", "city", "state")
        df_grouped = df_join_reviewAndBusiness.groupBy(
            ["user_id", "city", "state"]).count().toPandas()
        df_panda = res.toPandas()
        # A Spark DataFrame cannot be iterated like a pandas groupby, so the
        # counts are collected to pandas above; for every (user, city, state)
        # group with more than 10 reviews, bump that user's 'k' feature
        # (assumes the frame built from mapUsers has 'user_id' and 'k' columns).
        for _, row in df_grouped.iterrows():
            if row['count'] > 10:
                df_panda.loc[df_panda['user_id'] == row['user_id'], 'k'] += 1

        res = self.sqlContext.createDataFrame(df_panda)
        res.toJSON().saveAsTextFile('user_features.json')
Example #19
@author: weyu
'''

from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
import sys, pandas, json
from docplex.mp.model import Model
from collections import namedtuple
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType

ascontext = None
if len(sys.argv) > 1 and sys.argv[1] == "-test":
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    data = sqlContext.createDataFrame(
        pandas.read_csv(
            "C:\\Users\\weyu\\Documents\\awork_spss\\cplex\\wheat_or_barley_en.csv"
        ))
    limitationInput = "wheat_or_barley_limitation.py"
    input_name = "name"
    input_qmin = "qmin"
    input_qmax = "qmax"
    input_optimise = "Price"
else:
    import spss.pyspark.runtime
    ascontext = spss.pyspark.runtime.getContext()
    sc = ascontext.getSparkContext()
    sqlContext = ascontext.getSparkSQLContext()
    data = ascontext.getSparkInputData()
    limitationInput = "%%input_file_nutrients%%"
    input_name = "%%input_col_name%%"
    input_qmin = "%%input_col_qmin%%"
Example #20
        user_id       = match.group(3),
        date          = (match.group(4)[:-6]).split(":", 1)[0],
        time          = (match.group(4)[:-6]).split(":", 1)[1],
        method        = match.group(5),
        endpoint      = match.group(6),
        protocol      = match.group(7),
        response_code = int(match.group(8)),
        content_size  = int(match.group(9))
    )

# .cache() - Persists the RDD in memory, which will be re-used again
access_logs = (sc.textFile(logFile)
               .map(parse_apache_log_line)
               .cache())

schema_access_logs = sqlContext.createDataFrame(access_logs)
#Creates a table on which SQL like queries can be fired for analysis
schema_access_logs.registerTempTable("logs")

endpointsSearch = (sqlContext
                .sql("SELECT * FROM logs WHERE endpoint=" + argv[1])
                .rdd.map(lambda row: (row[0], row[1]))
                .collect())


# def mappingFunc(s):
#     words = s.split(" ")
#     return len(words)

# base_df = spark.read.text(raw_data_files)
# base_df.printSchema()
Example #21
KnownRow = Row('node', 'source', 'distance')

schema = StructType([
    StructField('node', StringType(), False),
    StructField('source', StringType(), False),
    StructField('distance', IntegerType(), False),
])

graphedges_rdd = textinput.map(lambda line: get_graphedges(line)).filter(lambda x: x is not None).flatMap(lambda x: x).coalesce(1)
graphedges = graphedges_rdd.toDF(['source', 'destination']).cache()
graphedges.registerTempTable('SourceDestTable')

initial_node = source_node
initial_row = KnownRow(initial_node, initial_node, 0)
knownpaths = sqlContext.createDataFrame([initial_row], schema=schema)
part_knownpaths = knownpaths

for i in range(6):
    part_knownpaths.registerTempTable('PartKnownPathTable')

    newpaths = sqlContext.sql("""
    SELECT destination AS node, t1.source AS source, (distance+1) AS distance FROM
    SourceDestTable t1
    JOIN
    PartKnownPathTable t2
    ON (t1.source = t2.node)
    """)

    newpaths.registerTempTable('NewPathTable')
    knownpaths.registerTempTable('KnowPathTable')
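    # The original snippet ends here. A hedged sketch (not the author's code) of
    # one way the iteration could continue: keep only newly reached nodes and
    # fold them into the running set of known paths.
    part_knownpaths = sqlContext.sql("""
    SELECT n.node AS node, n.source AS source, n.distance AS distance
    FROM NewPathTable n LEFT JOIN KnowPathTable k ON (n.node = k.node)
    WHERE k.node IS NULL
    """).dropDuplicates(['node'])
    knownpaths = knownpaths.unionAll(part_knownpaths)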