Example #1
 def test_convert_row_to_dict(self):
     row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
     self.assertEqual(1, row.asDict()['l'][0].a)
     df = self.sc.parallelize([row]).toDF()
     df.registerTempTable("test")
     row = self.sqlCtx.sql("select l, d from test").head()
     self.assertEqual(1, row.asDict()["l"][0].a)
     self.assertEqual(1.0, row.asDict()['d']['key'].c)
Example #2
    def test_convert_row_to_dict(self):
        row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
        self.assertEqual(1, row.asDict()['l'][0].a)
        df = self.sc.parallelize([row]).toDF()

        with self.tempView("test"):
            df.createOrReplaceTempView("test")
            row = self.spark.sql("select l, d from test").head()
            self.assertEqual(1, row.asDict()["l"][0].a)
            self.assertEqual(1.0, row.asDict()['d']['key'].c)
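Note that the nested values above stay `Row` objects; a minimal standalone sketch of flattening them with `asDict(recursive=True)`:

from pyspark.sql import Row

row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
d = row.asDict(recursive=True)   # nested Rows become plain dicts
assert d['l'][0]['a'] == 1
assert d['d']['key']['c'] == 1.0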
Example #3
 def test_column_select(self):
     df = self.df
     self.assertEqual(self.testData, df.select("*").collect())
     self.assertEqual(self.testData, df.select(df.key, df.value).collect())
     self.assertEqual([Row(value='1')],
                      df.where(df.key == 1).select(df.value).collect())
Example #4
 def parse_line_to_SparkSQLRow(self, line):
     from pyspark.sql import Row
     return Row(**self.parse_line_to_dict(line))
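A tiny illustration of the `Row(**dict)` idiom used above; the parsed dict here is a made-up placeholder, not the real `parse_line_to_dict` output:

from pyspark.sql import Row

parsed = {"ip": "127.0.0.1", "status": 200}   # hypothetical parsed line
r = Row(**parsed)
assert r.status == 200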
Example #5
    StructField("pix6",DoubleType(),True),
    StructField("pix7",DoubleType(),True),
    StructField("pix8",DoubleType(),True),
    StructField("pix9",DoubleType(),True),
    StructField("pix10",DoubleType(),True),
    StructField("pix11",DoubleType(),True),
    StructField("pix12",DoubleType(),True),
    StructField("pix13",DoubleType(),True),
    StructField("pix14",DoubleType(),True),
    StructField("pix15",DoubleType(),True),
    StructField("pix16",DoubleType(),True),
    StructField("label",DoubleType(),True)
])
pen_raw = sc.textFile("first-edition/ch08/penbased.dat", 4).map(lambda x:  x.split(", ")).map(lambda row: [float(x) for x in row])

dfpen = sqlContext.createDataFrame(pen_raw.map(lambda r: Row(*r)), penschema)
def parseRow(row):
    d = {("pix"+str(i)): row[i-1] for i in range(1,17)}
    d.update({"label": row[16]})
    return d

dfpen = sqlContext.createDataFrame(pen_raw.map(parseRow), penschema)
va = VectorAssembler(outputCol="features", inputCols=dfpen.columns[0:-1])
penlpoints = va.transform(dfpen).select("features", "label")

pensets = penlpoints.randomSplit([0.8, 0.2])
pentrain = pensets[0].cache()
penvalid = pensets[1].cache()

penlr = LogisticRegression(regParam=0.01)
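The snippet stops at the estimator; a minimal continuation sketch (not from the original source, and assuming a Spark version where LogisticRegression handles the multiclass label directly):

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

penmodel = penlr.fit(pentrain)                 # train on the 80% split
penpredictions = penmodel.transform(penvalid)  # score the 20% split
accuracy = MulticlassClassificationEvaluator(metricName="accuracy").evaluate(penpredictions)
print(accuracy)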
Example #6
def parseJson(v):
    ds = json.loads(v[1])
    return ((ds['shape'], ds['color']), [ds['size']])


db_prop = {'user': '******', 'password': '******'}


def writeToDB(v):
    if not v.isEmpty():
        try:
            v.toDF().write.jdbc(url='jdbc:postgresql://localhost:5432/mydb',
                                table='logs',
                                mode='append',
                                properties=db_prop)
        except Exception as e:
            print(e)
            print("bad, bad")


parsed = kafkaStream.map(parseJson)
parsed = parsed.reduceByKey(lambda x, y: x + y).map(
    lambda x: (x[0][0], (x[0][1], len(x[1]), np.percentile(x[1], 10))))
parsed.reduceByKey(lambda x, y: x if x[1] > y[1] else y).map(
    lambda v: Row(currentTime=datetime.datetime.now(),
                  shape=v[0],
                  mostPopularColor=v[1][0],
                  percentile=float(v[1][2]))).foreachRDD(writeToDB)

ssc.start()
ssc.awaitTermination()
Example #7

def get_graphedges(line):
    list1 = line.split(':')
    if list1[1] == '':
        return None
    list2 = list1[1].split(' ')
    list2 = filter(None, list2)
    results = []
    s = list1[0]
    for d in list2:
        results.append((s, d))
    return results


KnownRow = Row('node', 'source', 'distance')

schema = StructType([
    StructField('node', StringType(), False),
    StructField('source', StringType(), False),
    StructField('distance', IntegerType(), False),
])

graphedges_rdd = textinput.map(lambda line: get_graphedges(line)).filter(
    lambda x: x is not None).flatMap(lambda x: x).coalesce(1)
graphedges = graphedges_rdd.toDF(['source', 'destination']).cache()
graphedges.registerTempTable('SourceDestTable')

initial_node = source_node
initial_row = KnownRow(initial_node, initial_node, 0)
knownpaths = sqlContext.createDataFrame([initial_row], schema=schema)
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Ingest NLS pages, clean each page and extract its articles, and save them to HDFS, with some metadata associated with each article.
    
    Metadata collected:  "title",  "edition", "year", "place", "archive_filename",  "source_text_filename", "text_unit", 
    "text_unit_id", "num_text_unit", "type_archive", "model", "type_page", "header", "term", "definition",
    "num_articles", "num_page_words", "num_article_words",   

    Data is saved as Dataframes into HDFS. 

    Example:
    'Encyclopaedia Britannica: or, A dictionary of arts and sciences':
     - archive_name: /home/tdm/datasets/eb_test/144850366
       articles:
       ACQUEST:
        - or Acquist, in law, signifies goods got by purchase or donation. See CoNtiUEST.
       ACQUI:
         - "a town of Italy, in the Dutchy of Montferrat, with a biihop\u2019s see, and\
          \ commodious baths. It was taken by the Spaniards in 1745, and retaken by the\
          \ Piedmontese in 1746; but after this, it was taken again and difrcantled by\
          \ the French, who afterwards forsook it. It is seated on the river Bormio, 25\
          \ miles N.W. of Genoa, and 30 S. of Cafal, 8. 30. E. long. 44. 40. lat."
       ACQUIESCENCE:
         - in commerce, is the consent that a person gives to the determination given either
           by arbitration, orbyaconful

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: "0"
    :rtype: string
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    text_unit = "page"
    # [(title, edition, year, place, archive filename,
    #   num pages, type of archive, type of distribution, model)]
    documents = archives.flatMap(
        lambda archive: [(document.title, document.edition, document.year, \
                          document.place, document.archive.filename, document.num_pages, \
                           document.document_type, document.model, document) for document in list(archive)])

    # [(title, edition, year, place, archive filename, page filename, text_unit, text_unit_id, num_pages,
    #   type of archive, type of distribution, model, page_type, header, articles_page_dictionary, num_articles_page, num_page_words)]
    pages_clean = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1], year_document[2],\
                               year_document[3], year_document[4], page.code, text_unit, page.page_id, \
                               year_document[5], year_document[6], year_document[7], \
                               filter_terms_page(page, defoe_path, os_type), len(page.words)) for page in year_document[8]])

    # [(title, edition, year, place, archive filename, page filename, text_unit, text_unit_id, num_pages,
    #   type of archive, type of distribution, model, page_type, header, term, definition, num_articles_per_page, num_page_words, num_article_words)]

    pages_articles = pages_clean.flatMap(
        lambda articles_page: [(articles_page[0], articles_page[1], articles_page[2],\
                               articles_page[3], articles_page[4], articles_page[5], articles_page[6], articles_page[7], \
                               articles_page[8], articles_page[9], articles_page[10], \
                               articles_page[11][0], articles_page[11][1], key, articles_page[11][2][key], articles_page[11][3],\
                               articles_page[12], len(articles_page[11][2][key].split(" "))) for key in articles_page[11][2]])

    #[Encyclopaedia Britannica; or, A dictionary of arts and sciences, compiled upon a new plan, First edition, 1771, Volume 1, A-B, 1771, Edinburgh, /lustre/home/sc048/rosaf4/datasets/nls-data-encyclopaediaBritannica/144133901, alto/188083401.34.xml, page, Page53, 832, book, nlsArticles, Articles, AFFAFR, AFFIANCE, in law, denotes the mutual plighting of troth between a man and a woman to marry each, 32, 887, 17]

    nlsRow = Row("title", "edition", "year", "place", "archive_filename",
                 "source_text_filename", "text_unit", "text_unit_id",
                 "num_text_unit", "type_archive", "model", "type_page",
                 "header", "term", "definition", "num_articles",
                 "num_page_words", "num_article_words")
    sqlContext = SQLContext(context)
    df = sqlContext.createDataFrame(pages_articles, nlsRow)
    df.write.mode('overwrite').option("header",
                                      "true").csv("eb_total_articles.csv")
    return "0"
Example #9
def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.ITEM") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

# Create a SparkSession
spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/tmp").appName("BestMovies").getOrCreate()

# Load up our movie ID -> name dictionary
nameDict = loadMovieNames()

#Load the data
lines = spark.sparkContext.textFile("file:///SparkCourse/ml-100k/u.data")
#Convert to RDD with rows as objects
movies = lines.map(lambda x: Row(movieID=int(x.split()[1]), rating=float(x.split()[2])))

#Convert RDD above to a dataframe
movieDataset = spark.createDataFrame(movies)

#Group by movieID. Average ratings for each movie and count ratings per movie
bestMovieIDs = movieDataset.groupBy("movieID").agg(avg("rating").alias("avgRating"), count("movieID").alias("Ratings")).filter(col("Ratings")>=100)

orderedMovieIDs=bestMovieIDs.orderBy("avgRating", ascending=True).collect()

# Print the results
print('{:<40}{:>10}{:>10}'.format("Movie","Average","Ratings"))
for result in orderedMovieIDs:
    # Each row has movieID, avgRating, Ratings as above.
    print('{:<40}{:>10}{:>10}'.format(nameDict[result[0]][0:40], round(result[1],3), result[2]))
Example #10
if __name__ == '__main__':
    print("......... hello main method ......")
    # Infer the schema by reflection
    conf = SparkConf().setAppName("1st sql in spark")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    hc = HiveContext(sc)
    # Build a small collection to simulate a data source
    datas = ["1 zhangfei 44", "2 guanyu 55", "3 zilong 60"]
    # Parallelize datas into the RDD `source`; its element type is string
    source = sc.parallelize(datas)
    # c. Split every record of `source` and convert it into the RDD `rows`, whose element type is Row;
    #    at this point `rows` satisfies the requirement for a SchemaRDD: its elements are Rows.
    splits = source.map(lambda line: line.split(" "))
    rows = splits.map(
        lambda words: Row(id=words[0], name=words[1], age=words[2]))
    # d. Use the HiveContext to infer the schema of `rows` and turn it into the SchemaRDD `people`;
    people = hc.inferSchema(rows)
    # people.printSchema() shows the result of the schema inference:
    people.printSchema()
    print("the first print end now ......")
    # e. Register the SchemaRDD `people` as a temporary table "people";
    people.registerTempTable("people")
    # Run `select * from people where age>50 and age<60`, keep the result in `res`, and inspect it with res.printSchema():
    res = hc.sql("select * from people where age>50 and age<60")
    res.printSchema()
    print(".......................................................... so cute")
    # The result rows contain a single column named `name`, because the query selects only name.
    res1 = hc.sql("select name from people")
    res1.printSchema()
    print(".......................................................... so cute")
Example #11
def rebuild_microbatch(rdd, spark_conf):
    global_config = getConfig()
    try:
        encounters = rdd.collect()

        if len(encounters) > 0:
            start_time = datetime.datetime.utcnow()
            print("\n --- Micro-Batch --- \n")
            print("Building encounter objects " + time.ctime())

            rows = []
            encounter_ids = set()
            location_ids = set()
            visit_ids = set()
            patient_ids = set()
            form_ids = set()

            for encounter in encounters:
                ## filters
                encounter_ids.add(encounter['encounter_id'])
                location_ids.add(encounter['location_id'])
                visit_ids.add(encounter['visit_id'])
                patient_ids.add(encounter['patient_id'])
                form_ids.add(encounter['form_id'])

                encounter_object = Row(**encounter)
                rows.append(encounter_object)

            spark = get_spark_instance(spark_conf)

            obs_query = '(select * from obs where encounter_id in ({0})) foo'.format(
                (", ".join(["%d"] * len(encounter_ids))) %
                tuple(encounter_ids))

            obs = spark.read.format('jdbc').option('url', 'jdbc:mysql://mysql2:3306/' + 'amrs' + '?zeroDateTimeBehavior=convertToNull')\
                .option('useUnicode', 'true')\
                .option('continueBatchOnError', 'true').option('useSSL','false')\
                .option('user', global_config['mysql']['user'])\
                .option('password', global_config['mysql']['password'])\
                .option('dbtable', obs_query)\
                .load()

            encounter_df = spark.createDataFrame(
                rows, get_encounter_schema()).withColumnRenamed(
                    'encounter_datetime',
                    'encounter_unixtime').withColumnRenamed(
                        'date_created', 'unixtime_created').withColumnRenamed(
                            'date_voided',
                            'unixtime_voided').withColumnRenamed(
                                'date_changed',
                                'unixtime_changed').withColumnRenamed(
                                    'voided', 'voided_int')

            encounter_df_fixed_schema = encounter_df.withColumn(
                'encounter_datetime',
                f.to_timestamp(
                    f.from_unixtime(f.col("encounter_unixtime") / 1000))
            ).withColumn(
                'date_created',
                f.to_timestamp(
                    f.from_unixtime(f.col("unixtime_created") / 1000))
            ).withColumn(
                'date_voided',
                f.to_timestamp(f.from_unixtime(
                    f.col("unixtime_voided") / 1000))).withColumn(
                        'date_changed',
                        f.to_timestamp(
                            f.from_unixtime(
                                f.col("unixtime_changed") / 1000))).withColumn(
                                    'voided',
                                    f.when(f.col('voided_int') == 0,
                                           False).otherwise(True)).drop(
                                               'encounter_unixtime',
                                               'unixtime_created',
                                               'unixtime_voided',
                                               'unixtime_changed',
                                               'voided_int').alias('encounter')

            filters = {
                'encounter_ids': {
                    'column': 'encounter_id',
                    'values':
                    [0 if x is None else x for x in list(encounter_ids)]
                },
                'visit_ids': {
                    'column': 'visit_id',
                    'values': [0 if x is None else x for x in list(visit_ids)]
                },
                'form_ids': {
                    'column': 'form_id',
                    'values': [0 if x is None else x for x in list(form_ids)]
                },
                'location_ids': {
                    'column': 'location_id',
                    'values':
                    [0 if x is None else x for x in list(location_ids)]
                },
                'patient_ids': {
                    'column': 'patient_id',
                    'values':
                    [0 if x is None else x for x in list(patient_ids)]
                }
            }
            transformed_obs = transform_obs(obs)
            transformed_encounter = transform_encounter(
                encounter_df_fixed_schema, transformed_obs, True,
                filters).cache()
            save_to_cassandra(transformed_encounter, 'encounter')
            trigger_couch_update_jobs(location_ids, transformed_encounter)
            transformed_encounter.unpersist()

    except Exception:
        print("An unexpected error occurred")
        raise
Example #12
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


# Create a SparkSession (the config bit is only for Windows!)
spark = SparkSession.builder.appName("PopularMovies").getOrCreate()

# Load up our movie ID -> name dictionary
nameDict = loadMovieNames()

# Get the raw data
lines = spark.sparkContext.textFile("./ml-100k/u.data")
# Convert it to a RDD of Row objects
movies = lines.map(lambda x: Row(movieID=int(x.split()[1])))
# Convert that to a DataFrame
movieDataset = spark.createDataFrame(movies)

# Some SQL-style magic to sort all movies by popularity in one line!
topMovieIDs = movieDataset.groupBy("movieID").count().orderBy(
    "count", ascending=False).cache()

# Show the results at this point:

#|movieID|count|
#+-------+-----+
#|     50|  584|
#|    258|  509|
#|    100|  508|
Example #13
def combine_text(x):
    return Row(title=x.title, body=x.body, article=f"{x.title} {x.body}")
def test_top_n(spark):

    data_list = []

    for i in range(500):
        data_list.append({"col_1": "robin", "col_2": ["smith", "jones"]})

    for i in range(200):
        data_list.append({"col_1": "john", "col_2": ["jones"]})

    for i in range(300):
        data_list.append({
            "col_1":
            uuid4().hex[:10],
            "col_2": [uuid4().hex[:10],
                      uuid4().hex[:10],
                      uuid4().hex[:10]],
        })

    df = spark.createDataFrame(Row(**x) for x in data_list)
    df.createOrReplaceTempView("df")

    df_acvf = _generate_df_all_column_value_frequencies(["col_1", "col_2"], df,
                                                        spark)
    df_acvf.createOrReplaceTempView("df_acvf")
    df_acvf = df_acvf.persist()

    df_perc = _get_df_percentiles(df_acvf, spark)
    df_top_n = _get_df_top_bottom_n(df_acvf, spark, 20)

    percentiles_collected = _collect_and_group_percentiles_df(df_perc)
    top_n_collected = _collect_and_group_top_values(df_top_n)

    percentiles = percentiles_collected["col_1"]
    top_n = top_n_collected["col_1"]

    assert top_n[0]["value_count"] == 500
    assert top_n[0]["value"] == "robin"
    assert top_n[1]["value_count"] == 200
    assert top_n[1]["value"] == "john"

    assert percentiles[0]["percentile_ex_nulls"] == 1.0
    assert percentiles[0]["value_count"] == 500

    assert percentiles[1]["value_count"] == 500
    assert percentiles[2]["value_count"] == 200
    assert percentiles[-1]["value_count"] == 1

    percentiles = percentiles_collected["col_2"]
    top_n = top_n_collected["col_2"]

    assert top_n[0]["value_count"] == 500
    assert top_n[0]["value"] == "smith, jones"

    df_acvf = _generate_df_all_column_value_frequencies_array(["col_2"], df,
                                                              spark)
    df_acvf.createOrReplaceTempView("df_acvf")
    df_acvf = df_acvf.persist()

    df_top_n = _get_df_top_bottom_n(df_acvf, spark, 20)

    top_n = _collect_and_group_top_values(df_top_n)["col_2"]

    assert top_n[0]["value_count"] == 700
    assert top_n[0]["value"] == "jones"
def parseCSV(idx, part):
    if idx==0:
        part.next()
    for p in csv.reader(part):
        yield Row(ORIGIN=p[14],
                  DEP_DEL15 = p[33])
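A hedged usage sketch for the parser above; `flights_rdd` and `spark` are assumed names for an RDD of raw CSV lines and an active SparkSession:

# mapPartitionsWithIndex passes (partition index, iterator), matching parseCSV's signature
rows = flights_rdd.mapPartitionsWithIndex(parseCSV)
flights_df = spark.createDataFrame(rows)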
Example #16
from pyspark.sql import Row

if __name__ == "__main__":
    #cite: http://spark.apache.org/docs/latest/sql-programming-guide.html#tab_python_0
    spark = SparkSession \
        .builder \
        .appName("sql_top10_business") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    # Load a text file and convert each line to a Row.
    sc = spark.sparkContext
    review_rdd = sc.textFile("review.csv").map(lambda r: r.split("::"))
    business_rdd = sc.textFile("business.csv").map(lambda b: b.split("::"))

    reviewMap_rdd = review_rdd.map(
        lambda x: Row(business_id=x[2], user_id=x[1]))

    # Infer the schema, and register the DataFrame as a table.
    schema_review=spark.createDataFrame(reviewMap_rdd) \
                       .distinct()\
                       .groupBy('business_id')\
                       .count()
    #select first 10 business_id
    top10_schema = schema_review.sort("count", ascending=False)\
                         .head(10)
    # head(10) returns a plain list of Rows; parallelize it to turn it back into a DataFrame
    top10_schema = spark.createDataFrame(sc.parallelize(top10_schema))

    businessMap_rdd = business_rdd.map(
        lambda x: Row(business_id=x[0], full_address=x[1], categories=x[2]))
    # Infer the schema, and register the DataFrame as a table.
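The snippet is cut off here; a minimal sketch (an assumption, not the original continuation) of joining the business details onto the top-10 ids:

    # hedged sketch: attach full_address and categories to the top-10 business ids
    business_df = spark.createDataFrame(businessMap_rdd)
    top10_details = top10_schema.join(business_df, "business_id") \
                                .select("business_id", "full_address", "categories")
    top10_details.show(truncate=False)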
Example #17
from pyspark.sql import Row

datas1 = [("foo", 1), ("bar", 2)]
datas2 = [
    Row(name='Alice', age=5, height=80),
    Row(name='Alice', age=5, height=80),
    Row(name='Alice', age=10, height=80)
]

# Using the SparkContext
sc.parallelize(datas1).toDF().show()
sc.parallelize(datas2).toDF().show()

# Using the SparkSession
spark.createDataFrame(datas1).show()
spark.createDataFrame(datas2).show()
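A small follow-on sketch (the column names are assumptions) showing how to attach explicit names to the tuple data, which otherwise gets the default _1/_2 columns:

sc.parallelize(datas1).toDF(["name", "value"]).show()
spark.createDataFrame(datas1, ["name", "value"]).show()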
Example #18
    tempList.append(tweetLongitude)
    tempList.append(tweetLatitude)
    tempList.append(tweetText)
    tempList.append(reply)
    tempList.append(replyText)
    tempList.append(tweetResturant)
    tempList.append(tweetUrl)
    tempList.append(tweetLocation)
    return tempList

lines = sc.read.text(path).rdd.map(lambda x: x[0])\
          .map(lambda x: removeComma(x))\
          .map(lambda x: getText(x))\
          .filter(lambda x: not(x == None))\
          .filter(lambda x: checkEmpty(x[0]))\
          .map(lambda x: Row(id=x[0], user=x[1], timeStamp=x[2], geo=x[3], longitude=x[4], latitude=x[5], text=x[6], reply=x[7], replyText=x[8], resturant=x[9], url=x[10], userLocation=x[11]))

df = sc.createDataFrame(lines)

tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Extract the features
hashing_tf = HashingTF(numFeatures=2**16, inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
lines = Pipeline(stages=[tokenizer, hashing_tf, idf])

# Get the data to test
line_fit = lines.fit(df)
test_model = line_fit.transform(df)

# Load the trained model
Example #19
File: feature.py  Project: ggupta81/spark
    """
    Model fitted by :py:class:`RFormula`.
    """


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.feature tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    testData = sc.parallelize([
        Row(id=0, label="a"),
        Row(id=1, label="b"),
        Row(id=2, label="c"),
        Row(id=3, label="a"),
        Row(id=4, label="a"),
        Row(id=5, label="c")
    ], 2)
    globs['stringIndDf'] = sqlContext.createDataFrame(testData)
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
Example #20
#  Row(language='ta', count=2),
#  Row(language='et', count=3),
#  Row(language='zh', count=13),
#  Row(language='', count=16),
#  Row(language='se', count=21),
#  Row(language='fr', count=50),
#  Row(language='ja', count=6),
#  Row(language='id', count=3),
#  Row(language='la', count=6),
#  Row(language='da', count=4),
#  Row(language='fi', count=5),
#  Row(language='he', count=4)
# ]

language_counts = [
    Row(language="en", count=16871),
    Row(language="ru", count=299),
    Row(language="no", count=243),
    Row(language="es", count=80),
    Row(language="pt", count=56),
    Row(language="fr", count=50),
    Row(language="it", count=29),
    Row(language="ro", count=28),
    Row(language="se", count=21),
    Row(language="ms", count=20),
    Row(language="af", count=18),
    Row(language="de", count=17),
    Row(language="", count=16),
    Row(language="zh", count=13),
    Row(language="ku", count=11),
    Row(language="nl", count=11),
Example #21
    def to_row(x):
        dict = json.loads(x[1])
        output = {}
        output["meas_flag"] = dict.get("meas_flag", "")
        output["meas_method"] = dict.get("meas_method", "")
        output["company"] = dict.get("company", "")
        output["event"] = dict.get("event", "")
        output["meas_datatype"] = dict.get("meas_datatype", "")
        output["meas_description"] = dict.get("meas_description", "")
        output["meas_name"] = dict.get("meas_name", "")
        output["meas_status"] = dict.get("meas_status", "")
        output["meas_unit"] = dict.get("meas_unit", "")
        output["sensor"] = dict.get("sensor", "")
        output["site"] = dict.get("site", "")
        output["station"] = dict.get("station", "")
        output["ts"] = dict.get("ts", 0)

        # Set meas_value and meas_value_datatype
        output["meas_value_datatype"] = "unknown"
        output["meas_value_str"] = ""
        output["meas_value_d"] = np.nan
        output["meas_value_l"] = np.nan

        if "meas_value" in dict and isinstance(dict['meas_value'], unicode):
            output["meas_value_str"] = dict["meas_value"]
            if output["meas_value_str"] != "":
                output["meas_value_datatype"] = 'string'

        if "meas_value" in dict and isinstance(dict['meas_value'], float):
            if output["meas_datatype"] == "long" and long(
                    float(dict['meas_value'])) == float(dict['meas_value']):
                output["meas_value_l"] = float(dict["meas_value"])
                output["meas_value_datatype"] = 'long'
            else:
                output["meas_value_d"] = float(dict["meas_value"])
                output["meas_value_datatype"] = 'double'

        if "meas_value" in dict and (isinstance(dict['meas_value'], int)
                                     or isinstance(dict['meas_value'], long)):
            output["meas_value_l"] = float(dict["meas_value"])
            output["meas_value_datatype"] = 'long'

        # Set meas_lower_limit
        output["meas_lower_limit_d"] = np.nan
        output["meas_lower_limit_l"] = np.nan

        if "meas_lower_limit" in dict and isinstance(dict['meas_lower_limit'],
                                                     float):
            if long(float(dict["meas_lower_limit"])) == float(
                    dict["meas_lower_limit"]):
                output["meas_lower_limit_l"] = float(dict["meas_lower_limit"])
            else:
                output["meas_lower_limit_d"] = float(dict["meas_lower_limit"])

        if "meas_lower_limit" in dict and (
                isinstance(dict['meas_lower_limit'], int)
                or isinstance(dict['meas_lower_limit'], long)):
            output["meas_lower_limit_l"] = float(dict["meas_lower_limit"])

        # Set meas_upper_limit
        output["meas_upper_limit_d"] = np.nan
        output["meas_upper_limit_l"] = np.nan

        if "meas_upper_limit" in dict and isinstance(dict['meas_upper_limit'],
                                                     float):
            if long(float(dict["meas_upper_limit"])) == float(
                    dict["meas_upper_limit"]):
                output["meas_upper_limit_l"] = float(dict["meas_upper_limit"])
            else:
                output["meas_upper_limit_d"] = float(dict["meas_upper_limit"])

        if "meas_upper_limit" in dict and (
                isinstance(dict['meas_upper_limit'], int)
                or isinstance(dict['meas_upper_limit'], long)):
            output["meas_upper_limit_l"] = float(dict["meas_upper_limit"])

        return Row(**output)
import re
import os

from sklearn import decomposition
import matplotlib.pyplot as plt
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler


# In[63]:
#cold start dataset, cointains over 20 features
pwd = '/Users/RUIest/Desktop/big_data_project/lending_data_clean_v3.csv'
lines = spark.read.text(pwd).rdd
parts = lines.map(lambda row: row.value.split(','))
lend_RDD = parts.map(lambda p: Row(lable=int(p[22]),
                                   featuresList=(float(p[0]), float(p[1]), float(p[2]), float(p[3]),
                                                 float(p[4]), float(p[5]), float(p[6]), float(p[8]),
                                                 float(p[9]), float(p[10]), float(p[11]), float(p[12]),
                                                 float(p[13]), float(p[14]), float(p[15]), float(p[16]),
                                                 float(p[17]), float(p[18]), float(p[19]), float(p[20]),
                                                 float(p[21]))))


# In[338]:

# Create a DataFrame
lending_df = spark.createDataFrame(lend_RDD)
lending_df.show(10)


# In[339]:

# Convert feature type to vector
lending_df_vectors = lending_df.rdd.map(lambda row: Row(
    label=row["lable"],
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Pyspark example").getOrCreate()

from pyspark.sql import Row
from pyspark.sql.types import StructField, StringType, StructType, LongType

mySchema = StructType([
    StructField("column1", StringType(), True),
    StructField("column2", StringType(), True),
    StructField("column3", LongType(), False)
])
myRow = Row("Bonjour", "French", 1)
myDf = spark.createDataFrame([myRow], mySchema)
myDf.show()
Example #24
def _create_row(fields, values):
    row = Row(*values)
    row.__fields__ = fields
    return row
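For comparison, the public Row API gives the same shape without touching `__fields__` directly; a minimal sketch:

from pyspark.sql import Row

# Row(...) with field names returns a row factory; calling it supplies the values.
PersonRow = Row("name", "age")
r = PersonRow("Alice", 5)   # -> Row(name='Alice', age=5)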
Example #25
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SparkSession

spark = (SparkSession.builder.appName("Authors").getOrCreate())

schema = StructType([
    StructField("Author", StringType(), False),
    StructField("State", StringType(), False)
])

rows = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "CA")]
authors_df = spark.createDataFrame(rows, schema)
authors_df.show()
import sys

from pyspark import SparkContext
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

sc = SparkContext()
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("thach")\
        .getOrCreate()

    lines = sc.textFile(sys.argv[1])
    parts = lines.map(lambda l: l.split(",", 1))
    parts = parts.map(lambda l: [l[0], l[1].split(",")])
    plantsRDD = parts.map(lambda p: Row(plant=p[0], items=p[1]))

    plantsRDD_result = spark.createDataFrame(plantsRDD)

    plants_withID = plantsRDD_result.orderBy('plant').withColumn(
        "id", monotonically_increasing_id())
    plants_withID.createOrReplaceTempView("plant_states")

    getFrequentItems = plants_withID.select("id", "items")
    fpGrowth = FPGrowth(itemsCol="items",
                        minSupport=float(sys.argv[3]),
                        minConfidence=float(sys.argv[4]))
    model = fpGrowth.fit(getFrequentItems)

    def get_antecedent_length(antecedent):
        return len(antecedent)
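The snippet breaks off after the helper; a hedged sketch (not the original continuation) of wiring it up with the `udf` and `IntegerType` imports above to inspect the mined association rules:

    # assumes the fitted `model` from fpGrowth.fit(...) above
    antecedent_len = udf(get_antecedent_length, IntegerType())
    rules = model.associationRules.withColumn("antecedent_len", antecedent_len("antecedent"))
    rules.filter(rules.antecedent_len >= 2).show()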
    with open("basics/ml-100k/u.ITEM") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

# Create a SparkSession (the config bit is only for Windows!)
spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/temp").appName("PopularMovies").getOrCreate()

# Load up our movie ID -> name dictionary
nameDict = loadMovieNames()

# Get the raw data
lines = spark.sparkContext.textFile("file:///Spark-Python/basics/ml-100k/u.data")
# Convert it to a RDD of Row objects
movies = lines.map(lambda x: Row(movieID =int(x.split()[1])))  # single column of movieID
# Convert that to a DataFrame
movieDataset = spark.createDataFrame(movies)

# Some SQL-style magic to sort all movies by popularity in one line! Cache the resulting DataSet
topMovieIDs = movieDataset.groupBy("movieID").count().orderBy("count", ascending=False).cache()

# Show the results at this point:

#|movieID|count|
#+-------+-----+
#|     50|  584|
#|    258|  509|
#|    100|  508|

topMovieIDs.show()  # show top 20
Example #28
File: types.py  Project: Bekbolatov/spark
def _create_row(fields, values):
    row = Row(*values)
    row.__fields__ = fields
    return row
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

from pyspark.sql import SparkSession
spark= SparkSession\
                .builder \
                .appName("dataFrame") \
                .getOrCreate()

lines = spark.read.text(
    "/home/luogan/lg/softinstall/spark-2.3.0-bin-hadoop2.7/data/mllib/als/sample_movielens_ratings.txt"
).rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]),
                                     movieId=int(p[1]),
                                     rating=float(p[2]),
                                     timestamp=float(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5,
          regParam=0.01,
          userCol="userId",
          itemCol="movieId",
          ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
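The snippet is cut here; a minimal sketch of the RMSE step the comment describes, using the RegressionEvaluator already imported above:

predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))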
Example #30
def transformToNumeric(inputStr) :
    attList = inputStr.split(",")
    #srcip = float(attList[0])
    #srcport = float(attList[1])
    #dstip = float(attList[2])
    #dstport = float(attList[3])
    #proto = 1.0 if attList[4] == "tcp" else 0.0
    total_fpackets = float(attList[5])
    total_fvolume = float(attList[6])
    total_bpackets = float(attList[7])
    total_bvolume = float(attList[8])
    min_fpktl = float(attList[9])
    mean_fpktl = float(attList[10])
    max_fpktl = float(attList[11])
    std_fpktl = float(attList[12])
    min_bpktl = float(attList[13])
    mean_bpktl = float(attList[14])
    max_bpktl = float(attList[15])
    std_bpktl = float(attList[16])
    min_fiat = float(attList[17])
    mean_fiat = float(attList[18])
    max_fiat = float(attList[19])
    std_fiat = float(attList[20])
    min_biat = float(attList[21])
    mean_biat = float(attList[22])
    max_biat = float(attList[23])
    std_biat = float(attList[24])
    duration = float(attList[25])
    min_active = float(attList[26])
    mean_active = float(attList[27])
    max_active = float(attList[28])
    std_active = float(attList[29])
    min_idle = float(attList[30])
    mean_idle = float(attList[31])
    max_idle = float(attList[32])
    std_idle = float(attList[33])
    sflow_fpackets = float(attList[34])
    sflow_fbytes = float(attList[35])
    sflow_bpackets = float(attList[36])
    sflow_bbytes = float(attList[37])
    fpsh_cnt = float(attList[38])
    bpsh_cnt = float(attList[39])
    #furg_cnt = float(attList[40])
    #burg_cnt = float(attList[41])
    total_fhlen = float(attList[42])
    total_bhlen = float(attList[43])
    dscp = float(attList[44])
    classe = float(attList[45])

    linhas = Row(classe=classe, total_fpackets=total_fpackets, total_fvolume=total_fvolume,
                 total_bpackets=total_bpackets, total_bvolume=total_bvolume, min_fpktl=min_fpktl,
                 mean_fpktl=mean_fpktl, max_fpktl=max_fpktl, std_fpktl=std_fpktl, min_bpktl=min_bpktl,
                 mean_bpktl=mean_bpktl, max_bpktl=max_bpktl, std_bpktl=std_bpktl, min_fiat=min_fiat,
                 mean_fiat=mean_fiat, max_fiat=max_fiat, std_fiat=std_fiat, min_biat=min_biat,
                 mean_biat=mean_biat, max_biat=max_biat, std_biat=std_biat, duration=duration,
                 min_active=min_active, mean_active=mean_active, max_active=max_active,
                 std_active=std_active, min_idle=min_idle, mean_idle=mean_idle, max_idle=max_idle,
                 std_idle=std_idle, sflow_fpackets=sflow_fpackets, sflow_fbytes=sflow_fbytes,
                 sflow_bpackets=sflow_bpackets, sflow_bbytes=sflow_bbytes, fpsh_cnt=fpsh_cnt,
                 bpsh_cnt=bpsh_cnt, total_fhlen=total_fhlen,
                 total_bhlen=total_bhlen, dscp=dscp)

    return linhas
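A hedged usage sketch for the function above; `flowLines` and `spark` are assumed names for an RDD of raw CSV lines and an active SparkSession:

flowRows = flowLines.map(transformToNumeric)
flowDf = spark.createDataFrame(flowRows)
flowDf.printSchema()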
Example #31
def func1():
    rawUserRdd = sc.textFile(Path + "u.user")
    print("数据量 rawUserRdd.count():=", rawUserRdd.count())
    print("查看前2行:", rawUserRdd.take(2))
    user_Rows = rawUserRdd.map(lambda p: Row(
        userid=int(p[0]),
        age=int(p[1]),
        gender=p[2],
        occupation=p[3],
        zipcode=p[4]
    ))
    print("dataFrame 的前3,user_Rows.take(3):", user_Rows.take(3))
    user_df = sqlContext.createDataFrame(user_Rows)
    # 展示schema,类似表结构
    print("#展示schema,类似表结构:")
    user_df.printSchema()
    # 展示前3个数据
    print("#展示前3个数据:")
    user_df.show(3)
    # dataFrame创建别名
    df = user_df.alias("df")
    print("#dataFrame创建别名:")
    df.show(3)
    df.registerTempTable("user_table")
    print("sparkSQL 查询条数:")
    sqlContext.sql("select count(*) counts from user_table").show()
    # 多行输入,3引号的使用
    print("sparkSQL 查询条数2:")
    sqlContext.sql("""select count(*) counts 
    from user_table""").show()
    print("sparkSQL 查询数据(默认前20条):")
    sqlContext.sql("select *  from user_table").show()
    print("sparkSQL 查询数据(指定3条):")
    sqlContext.sql("select *  from user_table").show(3)
    print("sparkSQL 查询数据(指定3条使用limit,可减少运行时间):")
    sqlContext.sql("select *  from user_table limit 3").show()

    ###### Three ways to select specific columns: RDD, DataFrame, SQL
    # RDD
    userRDDnew = rawUserRdd.map(lambda x: (x[0], x[3], x[2], x[1]))  # select columns
    print("Select columns with the RDD API:", userRDDnew.take(3))
    # Select columns with the DataFrame API
    print("# Select columns with the DataFrame API, passing column names as strings:")
    user_df.select("userid", "occupation", "gender", "age").show(3)
    print("# Select columns via DataFrame attributes (the alias works too, e.g. df.userid, df.occupation; bracket access df['occupation'] also works):")
    user_df.select(user_df.userid, user_df.occupation, user_df.gender, user_df.age).show(3)
    # spark sql
    sqlContext.sql("select userid,occupation,gender,age  from user_table limit 3").show()

    ##### Add computed columns, i.e. values derived by calculation
    # RDD
    userRDDnew2 = rawUserRdd.map(lambda x: (x[0], x[3], x[2], 2016 - int(x[1])))
    print("RDD computed column:", userRDDnew2.take(3))
    # DataFrames: compute the value and give it an alias, otherwise the column name becomes 2016-df.age
    print("DataFrame computed column:")
    df.select("userid", "occupation", "gender", (2016 - df.age).alias("birthyear")).show(3)
    # sparksql
    print("sparksql:")
    sqlContext.sql("select userid,occupation,gender,2016-age birthyear  from user_table").show(3)

    ###### Filter data, similar to a WHERE clause
    # RDD
    print("RDD filter with a lambda expression:",
          rawUserRdd.filter(lambda r: r[3] == "technician" and r[2] == "M" and r[1] == 24).take(3))
    # dataframes
    # 1. chained filters are equivalent to AND
    user_df.filter("occupation='technician'").filter("gender='M'").filter("age=24").show()
    # 2. a single filter string combined with and/or/not
    user_df.filter("occupation='technician' and gender='M' and age=24").show()
    # 3. using [name].[column]; = must be ==, and must be &, bracket access works the same way
    df.filter((df.occupation == "technician") & (df.gender == "M") & (df.age == 24)).show()
    # sparksql: simply add a WHERE clause as in plain SQL
    sqlContext.sql(
        "select userid,occupation,gender,age  from user_table where occupation='technician' and gender='M' and age=24").show(
        3)

    ##### Sorting
    # RDD  takeOrdered
    print("RDD sort, ascending by default:", rawUserRdd.takeOrdered(3, key=lambda x: int(x[1])))
    print("RDD sort, descending (negate the key):", rawUserRdd.takeOrdered(3, key=lambda x: -1 * int(x[1])))
    # dataframes
    # 1. ascending (the default)
    user_df.select("userid", "occupation", "gender", "age").orderBy("age").show(3)
    user_df.select("userid", "occupation", "gender", "age").orderBy(df.age).show(3)
    # 2. descending
    user_df.select("userid", "occupation", "gender", "age").orderBy("age", ascending=0).show(3)
    user_df.select("userid", "occupation", "gender", "age").orderBy(df.age.desc()).show(3)
    # sparksql order by desc,asc
    sqlContext.sql("select userid,occupation,gender,age  from user_table order by age asc").show(3)
    sqlContext.sql("select userid,occupation,gender,age  from user_table order by age desc").show(3)
    #### Sort by multiple columns
    # rdd
    print("RDD multi-column sort:", rawUserRdd.takeOrdered(3, key=lambda x: (-int(x[1]), x[2])))  # x[1] descending first, then x[2] ascending
    # dataframes
    df.orderBy(["age", "gender"], ascending=[0, 1]).show(3)  # 0 means descending, 1 means ascending
    df.orderBy(df.age.desc(), df.gender).show(3)
    # sparksql
    sqlContext.sql("select userid,occupation,gender,age  from user_table order by age desc,gender asc").show(3)

    ##### Deduplication
    # rdd
    print("RDD distinct:", rawUserRdd.map(lambda x: x[2]).distinct().collect())
    # restrict to multiple columns, similar to a composite key
    print("RDD distinct on multiple columns:", rawUserRdd.map(lambda x: (x[1], x[2])).distinct().take(5))
    # dataframes
    user_df.select("gender").distinct().show()
    user_df.select("age", "gender").distinct().show()  # multiple columns
    # sparksql
    sqlContext.sql("select distinct gender from user_table ").show()

    #### Grouping and aggregation
    # rdd
    print("RDD group-by count:", rawUserRdd.map(lambda x: (x[2], 1)).reduceByKey(
        lambda x, y: x + y).collect())  # map each record to (gender, 1), then reduce to sum per gender
    print("RDD group-by count, multiple columns:",
          rawUserRdd.map(lambda x: ((x[2], x[3]), 1)).reduceByKey(lambda x, y: x + y).collect())  # count by gender and occupation
    # dataframes
    user_df.select("gender").groupBy("gender").count().show()
    user_df.select("gender", "occupation").groupBy("gender", "occupation").count().orderBy("gender", "occupation").show(
        10)
    # TODO: crosstab
    user_df.stat.crosstab("occupation", "gender").show(10)
    # sparksql
    sqlContext.sql("select  gender,count(*) counts from user_table group by gender").show()
    sqlContext.sql("select  gender,occupation,count(*) counts from user_table group by gender,occupation").show(10)

    ### Build the zipcode data
    ZipCodeRDD = getZipcode()
    zipcode_data = ZipCodeRDD.map(lambda p: Row(
        zipcode=int(p[0]),
        zipCodeType=p[1],
        city=p[2],
        state=p[3]
    ))
    print("zipcode first 3:", zipcode_data.take(3))
    zipcode_df=sqlContext.createDataFrame(zipcode_data)
    zipcode_df.printSchema()
    # register a temporary table
    zipcode_df.registerTempTable("zipcode_table")
    zipcode_df.show(3)

    ##### Join data
    #sparksql
    sqlContext.sql("select  u.*,z.city,z.state from user_table u left join zipcode_table z on u.zipcode=z.zipcode where z.state='NY'").show(10)  # users in New York
    sqlContext.sql( "select z.state, count(*) from user_table u left join zipcode_table z on u.zipcode=z.zipcode group by z.state ").show( 10)  # user counts per state
    #dataframes
    joined_df=user_df.join(zipcode_df,user_df.zipcode==zipcode_df.zipcode,"left_outer")
    print("dataframes after the join:")
    joined_df.printSchema()
    # group by state
    groupByState_df=joined_df.groupBy("state").count()

    groupByState_pandas_df=groupByState_df.toPandas().set_index("state")
    # plot a bar chart of the per-state counts
    ax=groupByState_pandas_df["count"].plot(kind="bar",title="State",figsize=(12,6),legend=True,fontsize=12)
    plt.show()

    # count users per occupation and show it as a pie chart
    Occupation_df=sqlContext.sql("select u.occupation,count(*) counts from user_table u group by occupation")
    Occupation_pandas_df=Occupation_df.toPandas().set_index("occupation")
    ax2=Occupation_pandas_df["counts"].plot(kind="pie",title="occupation",figsize=(8,8),startangle=90,autopct="%1.1f%%")
    ax2.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0.)
    plt.show()
Example #32
    centerX.append(center[0])
    centerY.append(center[1])
    print("cluster "+ str(i)+": "+str(center)+"\n")


# In[18]:


def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

get_ipython().magic(u'time WSSSE = rdd_final.map(lambda point: error(point)).reduce(lambda x, y: x + y)')
print("Within Set Sum of Squared Error = " + str(WSSSE))


# In[20]:


from pyspark.sql import Row
import pandas as pd
import matplotlib.pyplot as plt

pdf = (rdd_final.map(lambda (path,times) : Row(path=path,times=times))).toDF()
pdf = pdf.toPandas()
centers = pd.DataFrame({'x': centerX , 'y': centerY})
pdf.plot(kind='scatter',x ='path',y='times')
plt.plot(centerX,centerY,'rs')
plt.show()

Example #33
def fake_entry():
    name = fake.name().split()
    return Row(name[1], name[0], fake.ssn(), fake.job(),
               abs(2016 - fake.date_time().year) + 1)
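A hedged sketch of turning `fake_entry` into a DataFrame; `spark` and the `fake` Faker instance are assumed from the surrounding context, and the column names are assumptions, since `Row(*values)` carries no field names of its own:

people_df = spark.createDataFrame([fake_entry() for _ in range(100)],
                                  ["last_name", "first_name", "ssn", "occupation", "age"])
people_df.show(5)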
#http://mail-archives.apache.org/mod_mbox/spark-user/201505.mbox/%3CCAA+15pcYAmJn_CdA8Wu4hh+JCh7b0Kmk+jAQ6S=jgVgPKgxXXg@mail.gmail.com%3E
spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
spark._jsc.hadoopConfiguration().set("parquet.enable.summary-metadata", "false")

AWS_REGION = 'us-east-1'
MIN_SENTENCE_LENGTH_IN_CHARS = 10
MAX_SENTENCE_LENGTH_IN_CHARS = 4500
COMPREHEND_BATCH_SIZE = 25  ## This batch size results in groups no larger than 25 items
NUMBER_OF_BATCHES = 10
ROW_LIMIT = 10000

## Each task handles 25*4 records, there should be 10 partitions overall to process 1000 records.

#A PySpark.sql row in SchemaRDD. The fields in it can be accessed like attributes.
#Here, Row is used to create a Row like class that takes review_id and sentiment as attributes.
SentimentRow = Row("review_id", "sentiment")

#Defining method to get batch sentiment from Comprehend
def getBatchSentiment(input_list):
  ## You can import the ratelimit module if you want to further rate limit API calls to Comprehend
  ## https://pypi.org/project/ratelimit/
  #from ratelimit import rate_limited
  arr = []
  bodies = [i[1] for i in input_list]
  client = boto3.client('comprehend',region_name = AWS_REGION)

  #@rate_limited(1)
  def callApi(text_list):
    response = client.batch_detect_sentiment(TextList = text_list, LanguageCode = 'en')
    return response