Example #1
 def test_column_select(self):
     df = self.df
     self.assertEqual(self.testData, df.select("*").collect())
     self.assertEqual(self.testData, df.select(df.key, df.value).collect())
     self.assertEqual([Row(value='1')],
                      df.where(df.key == 1).select(df.value).collect())
Example #2
    for category_dir in listdir(
            input_dir
    ):  # Build the dataset of (docname, category, wordcounts) tuples
        distinct_labels[curr_cat] = category_dir
        next_docs = sc.wholeTextFiles(('/').join([input_dir, category_dir]))
        # Python 3: a lambda cannot unpack a (doc, lines) tuple, so index the pair;
        # bind the current category as a default argument to keep its value.
        docs = docs.union(
            next_docs.map(lambda doc_lines, cat=curr_cat:
                          (format_text(doc_lines[1]), float(cat))))
        curr_cat += 1

    training_rows = docs.sample(False, train_fraction)
    testing_rows = docs.subtract(training_rows)

    # Prepare training and test documents, which are labeled.
    LabeledDocument = Row("text", "label")
    train = training_rows.map(lambda x: LabeledDocument(*x)).toDF()
    test = testing_rows.map(lambda x: LabeledDocument(*x)).toDF()

    # Configure an ML pipeline consisting of four stages: tokenizer, hashingTF, idf, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="rawFeatures")  #outputCol="features")
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    lr = LogisticRegression(maxIter=1000, regParam=0.001)

    #pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    p0 = Pipeline(stages=[tokenizer, hashingTF, idf, lr])
    #m0 = p0.fit(train)
    #pipeline = Pipeline(stages=[m0, lr])
    pipeline = p0
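
    # A minimal continuation sketch (hedged, echoing the commented-out fit above):
    # train the four-stage pipeline on the labeled data and score the test set.
    model = pipeline.fit(train)
    prediction = model.transform(test)
    prediction.select("text", "label", "prediction").show(5)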
Example #3
def map_time_mobile(x):
    m_list = []
    mobile_value_time_cut = {
        '15day': [],
        '365day': [],
        '180day': [],
        '90day': [],
        '60day': [],
        '30day': [],
        '30_90day': [],
        '90_180day': [],
        '180_365day': []
    }
    if x.mobile_value:
        for l in x.mobile_value:
            check_num = 0
            if l['eventtype'] != 'Loan':
                check_num = 1
            else:
                # de-duplicate Loan events by day and partner
                key = 'Loan' + str(
                    int(float(l['eventoccurtime']) // 86400)) + str(l['partnercode'])
                if key not in m_list:
                    check_num = 1
                    m_list.append(key)
            if check_num == 1 and float(l['eventoccurtime']) <= float(
                    x.loan_date_unix):
                if float(x.loan_date_unix) - float(
                        l['eventoccurtime']) < 31536000:
                    mobile_value_time_cut['365day'].append(l)
                if float(x.loan_date_unix) - float(
                        l['eventoccurtime']) < 15552000:
                    mobile_value_time_cut['180day'].append(l)
                if float(x.loan_date_unix) - float(
                        l['eventoccurtime']) < 7776000:
                    mobile_value_time_cut['90day'].append(l)
                if float(x.loan_date_unix) - float(
                        l['eventoccurtime']) < 5184000:
                    mobile_value_time_cut['60day'].append(l)
                if float(x.loan_date_unix) - float(
                        l['eventoccurtime']) < 2592000:
                    mobile_value_time_cut['30day'].append(l)
                if float(x.loan_date_unix) - float(
                        l['eventoccurtime']) < 1296000:
                    mobile_value_time_cut['15day'].append(l)
                if float(x.loan_date_unix) - float(
                        l['eventoccurtime']) >= 2592000 and (
                            float(x.loan_date_unix) -
                            float(l['eventoccurtime']) < 7776000):
                    mobile_value_time_cut['30_90day'].append(l)
                if float(x.loan_date_unix) - float(
                        l['eventoccurtime']) >= 7776000 and (
                            float(x.loan_date_unix) -
                            float(l['eventoccurtime']) < 15552000):
                    mobile_value_time_cut['90_180day'].append(l)
                if float(x.loan_date_unix) - float(
                        l['eventoccurtime']) >= 15552000 and (
                            float(x.loan_date_unix) -
                            float(l['eventoccurtime']) < 31536000):
                    mobile_value_time_cut['180_365day'].append(l)
    value = x.asDict()
    value['mobile_value_time_cut'] = mobile_value_time_cut
    del value['mobile_value']
    return Row(**value)
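
# A minimal usage sketch (hedged), assuming a DataFrame `df` whose rows carry the
# `mobile_value` and `loan_date_unix` fields used above:
df_with_cuts = df.rdd.map(map_time_mobile).toDF()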
Example #4
    def test_fillna(self):
        schema = StructType([
            StructField("name", StringType(), True),
            StructField("age", IntegerType(), True),
            StructField("height", DoubleType(), True),
            StructField("spy", BooleanType(), True)
        ])

        # fillna shouldn't change non-null values
        row = self.spark.createDataFrame([(u'Alice', 10, 80.1, True)],
                                         schema).fillna(50).first()
        self.assertEqual(row.age, 10)

        # fillna with int
        row = self.spark.createDataFrame([(u'Alice', None, None, None)],
                                         schema).fillna(50).first()
        self.assertEqual(row.age, 50)
        self.assertEqual(row.height, 50.0)

        # fillna with double
        row = self.spark.createDataFrame([(u'Alice', None, None, None)],
                                         schema).fillna(50.1).first()
        self.assertEqual(row.age, 50)
        self.assertEqual(row.height, 50.1)

        # fillna with bool
        row = self.spark.createDataFrame([(u'Alice', None, None, None)],
                                         schema).fillna(True).first()
        self.assertEqual(row.age, None)
        self.assertEqual(row.spy, True)

        # fillna with string
        row = self.spark.createDataFrame([(None, None, None, None)],
                                         schema).fillna("hello").first()
        self.assertEqual(row.name, u"hello")
        self.assertEqual(row.age, None)

        # fillna with subset specified for numeric cols
        row = self.spark.createDataFrame([(None, None, None, None)],
                                         schema).fillna(50,
                                                        subset=['name', 'age'
                                                                ]).first()
        self.assertEqual(row.name, None)
        self.assertEqual(row.age, 50)
        self.assertEqual(row.height, None)
        self.assertEqual(row.spy, None)

        # fillna with subset specified for string cols
        row = self.spark.createDataFrame([(None, None, None, None)],
                                         schema).fillna("haha",
                                                        subset=['name', 'age'
                                                                ]).first()
        self.assertEqual(row.name, "haha")
        self.assertEqual(row.age, None)
        self.assertEqual(row.height, None)
        self.assertEqual(row.spy, None)

        # fillna with subset specified for bool cols
        row = self.spark.createDataFrame([(None, None, None, None)],
                                         schema).fillna(True,
                                                        subset=['name', 'spy'
                                                                ]).first()
        self.assertEqual(row.name, None)
        self.assertEqual(row.age, None)
        self.assertEqual(row.height, None)
        self.assertEqual(row.spy, True)

        # fillna with dictionary for boolean types
        row = self.spark.createDataFrame([Row(a=None),
                                          Row(a=True)]).fillna({
                                              "a": True
                                          }).first()
        self.assertEqual(row.a, True)
Example #5
def __construct_row(values):
    return Row(contigName="chr21",
               start=100,
               referenceAllele="A",
               alternateAlleles=["T", "C"],
               values=values)
Example #6
def test_rows_both_all_shows_known_diffs_flag_and_known_diffs_count_as_matches(
        spark, comparison_kd1):
    expected_df = spark.createDataFrame([
        Row(acct=10000001234,
            acct_seq=0,
            stat_cd_base='*2',
            stat_cd_compare=None,
            stat_cd_match=True,
            stat_cd_match_type="KNOWN_DIFFERENCE",
            open_dt_base=datetime.date(2017, 5, 1),
            open_dt_compare=2017121,
            open_dt_match=True,
            open_dt_match_type="KNOWN_DIFFERENCE",
            cd_base='0001',
            cd_compare=1.0,
            cd_match=True,
            cd_match_type="KNOWN_DIFFERENCE"),
        Row(acct=10000001235,
            acct_seq=0,
            stat_cd_base='V1',
            stat_cd_compare='V1',
            stat_cd_match=True,
            stat_cd_match_type="MATCH",
            open_dt_base=datetime.date(2017, 5, 2),
            open_dt_compare=2017122,
            open_dt_match=True,
            open_dt_match_type="KNOWN_DIFFERENCE",
            cd_base='0002',
            cd_compare=2.0,
            cd_match=True,
            cd_match_type="KNOWN_DIFFERENCE"),
        Row(acct=10000001236,
            acct_seq=0,
            stat_cd_base='V2',
            stat_cd_compare='V2',
            stat_cd_match=True,
            stat_cd_match_type="MATCH",
            open_dt_base=datetime.date(2017, 5, 3),
            open_dt_compare=2017123,
            open_dt_match=True,
            open_dt_match_type="KNOWN_DIFFERENCE",
            cd_base='0003',
            cd_compare=3.0,
            cd_match=True,
            cd_match_type="KNOWN_DIFFERENCE"),
        Row(acct=10000001237,
            acct_seq=0,
            stat_cd_base='*2',
            stat_cd_compare='V3',
            stat_cd_match=False,
            stat_cd_match_type="MISMATCH",
            open_dt_base=datetime.date(2017, 5, 4),
            open_dt_compare=2017124,
            open_dt_match=True,
            open_dt_match_type="KNOWN_DIFFERENCE",
            cd_base='0004',
            cd_compare=4.0,
            cd_match=True,
            cd_match_type="KNOWN_DIFFERENCE"),
        Row(acct=10000001238,
            acct_seq=0,
            stat_cd_base='*2',
            stat_cd_compare=None,
            stat_cd_match=True,
            stat_cd_match_type="KNOWN_DIFFERENCE",
            open_dt_base=datetime.date(2017, 5, 5),
            open_dt_compare=2017125,
            open_dt_match=True,
            open_dt_match_type="KNOWN_DIFFERENCE",
            cd_base='0005',
            cd_compare=5.0,
            cd_match=True,
            cd_match_type="KNOWN_DIFFERENCE")
    ])

    assert comparison_kd1.rows_both_all.count() == 5
    assert expected_df.unionAll(
        comparison_kd1.rows_both_all).distinct().count() == 5
Example #7
# Compute the squared log error
def squared_log_error(pred, actual):
    return (np.log(pred + 1) - np.log(actual + 1))**2


###############  Start Spark ##################################
# get context, data and split the data
sc = SparkContext("local", "hw11p1")
path = "file:///home/cloudera/Documents/hw11/data/Small_Car_Data.csv"
raw_data = sc.textFile(path)

sqlContext = SQLContext(sc)
parts = raw_data.map(lambda l: l.split(","))

pre_df = parts.map(lambda p: Row(displacement=p[3], hspower=p[4]))

# create dataframe for cleaning the data later on
df = sqlContext.createDataFrame(pre_df)

# Count the number of rows before cleaning the data (via filtering)
print("Before filtering count=", file=f)
print(df.count(), file=f)

# cleaning the data
dff = df.where((df.displacement != 'NaN') & (df.hspower != 'NaN'))

# Count the number of rows after cleaning the data (via filtering)
print("After filtering count=", file=f)
print(dff.count(), file=f)
Example #8
 def test_corr(self):
     import math
     df = self.sc.parallelize([Row(a=i, b=math.sqrt(i))
                               for i in range(10)]).toDF()
     corr = df.stat.corr(u"a", "b")
     self.assertTrue(abs(corr - 0.95734012) < 1e-6)
Example #9
sc = spark.sparkContext

# In[3]:

lines = sc.textFile('people.txt')

# In[4]:

# Inferring the Schema Using Reflection
parts = lines.map(lambda l: l.split(","))

# Spark SQL can convert an RDD of Row objects to a DataFrame, inferring the datatypes.
# Rows are constructed by passing a list of key/value pairs as kwargs to the Row class.
# The keys of this list define the column names of the table, and the types are inferred
# by sampling the whole dataset, similar to the inference that is performed on JSON files.
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))

# In[5]:

# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people)
schemaPeople.createTempView("people")

# In[6]:

# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")

# In[7]:

teenagers
Example #10
 def test_expr(self):
     from pyspark.sql import functions
     row = Row(a="length string", b=75)
     df = self.spark.createDataFrame([row])
     result = df.select(functions.expr("length(a)")).collect()[0].asDict()
     self.assertEqual(13, result["length(a)"])
Example #11
    def test_sort_with_nulls_order(self):
        from pyspark.sql import functions

        df = self.spark.createDataFrame([('Tom', 80), (None, 60),
                                         ('Alice', 50)], ["name", "height"])
        self.assertEqual(
            df.select(df.name).orderBy(
                functions.asc_nulls_first('name')).collect(),
            [Row(name=None),
             Row(name=u'Alice'),
             Row(name=u'Tom')])
        self.assertEqual(
            df.select(df.name).orderBy(
                functions.asc_nulls_last('name')).collect(),
            [Row(name=u'Alice'),
             Row(name=u'Tom'),
             Row(name=None)])
        self.assertEqual(
            df.select(df.name).orderBy(
                functions.desc_nulls_first('name')).collect(),
            [Row(name=None),
             Row(name=u'Tom'),
             Row(name=u'Alice')])
        self.assertEqual(
            df.select(df.name).orderBy(
                functions.desc_nulls_last('name')).collect(),
            [Row(name=u'Tom'),
             Row(name=u'Alice'),
             Row(name=None)])
Example #12
 def test_dayofweek(self):
     from pyspark.sql.functions import dayofweek
     dt = datetime.datetime(2017, 11, 6)
     df = self.spark.createDataFrame([Row(date=dt)])
     row = df.select(dayofweek(df.date)).first()
     self.assertEqual(row[0], 2)
Example #13
 def test_bit_length_function(self):
     # SPARK-36751: add bit length api for python
     df = self.spark.createDataFrame([('cat', ), ('\U0001F408', )], ['cat'])
     actual = df.select(bit_length('cat')).collect()
     self.assertEqual([Row(24), Row(32)], actual)
Example #14
 def test_cov(self):
     df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
     cov = df.stat.cov(u"a", "b")
     self.assertTrue(abs(cov - 55.0 / 3) < 1e-6)
Example #15
# expr('someCol - 5') is the same transformation as col('someCol') - 5, or even expr('someCol') - 5.
# This might be confusing, but remember a couple of key points:
#   - Columns are just expressions: expr('count') evaluates to a value for each row.
#   - Columns and transformations of those columns compile to the same logical plan as the parsed expression.

(((col('someCol') + 5) * 200) - 6) < col('otherCol')

# You can write your expressions as DataFrame code or as SQL expressions and get the same performance characteristics.
df.select(expr('(((count + 5) * 200) - 6)')).show(5)
df.select((((col('count') + 5) * 200) - 6)).show(5)


from pyspark.sql import Row

# Accessing data in rows: fields can be accessed by position
myRow = Row('Hello', None, 1)

r1 = myRow[0]
print(r1)  # Hello

df.createOrReplaceTempView('dfTable')

# Create DataFrame on the fly

from pyspark.sql.types import StructType, StructField, StringType, LongType

my_schema = StructType([
        StructField('some', StringType(), True),
        StructField('col', StringType(), True),
        StructField('names', LongType(), False, metadata={'hello': 'world'})
        ])

myDF = spark.createDataFrame([myRow], my_schema)
Example #16
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
import sys

spark = SparkSession.builder.appName("ALS").getOrCreate()

lines = spark.read.text(sys.argv[1]).rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(
    lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2])))
ratings = spark.createDataFrame(ratingsRDD)

splits = [0.9, 0.85, 0.8, 0.75, 0.7]

f = open("ALS_out_2.txt", "w")

for i in splits:
    (training, test) = ratings.randomSplit([i, 1 - i])
    als = ALS(maxIter=20,
              regParam=0.1,
              userCol="userId",
              itemCol="movieId",
              ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)

    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
Example #17
def test_rows_both_all_returns_all_rows_in_both_dataframes_for_differently_named_columns(
        spark, comparison3):
    expected_df = spark.createDataFrame([
        Row(acct=10000001234,
            dollar_amt_base=123,
            dollar_amt_compare=123.4,
            dollar_amt_match=False,
            name_base='George Maharis',
            name_compare='George Michael Bluth',
            name_match=False,
            float_fld_base=14530.1555,
            float_fld_compare=14530.155,
            float_fld_match=False,
            date_fld_base=datetime.date(2017, 1, 1),
            date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True,
            accnt_purge=False),
        Row(acct=10000001235,
            dollar_amt_base=0,
            dollar_amt_compare=0.45,
            dollar_amt_match=False,
            name_base='Michael Bluth',
            name_compare='Michael Bluth',
            name_match=True,
            float_fld_base=1.0,
            float_fld_compare=1.0,
            float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1),
            date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True,
            accnt_purge=False),
        Row(acct=10000001236,
            dollar_amt_base=1345,
            dollar_amt_compare=1345.0,
            dollar_amt_match=True,
            name_base='George Bluth',
            name_compare='George Bluth',
            name_match=True,
            float_fld_base=None,
            float_fld_compare=None,
            float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1),
            date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True,
            accnt_purge=False),
        Row(acct=10000001237,
            dollar_amt_base=123456,
            dollar_amt_compare=123456.0,
            dollar_amt_match=True,
            name_base='Bob Loblaw',
            name_compare='Bob Loblaw',
            name_match=True,
            float_fld_base=345.12,
            float_fld_compare=345.12,
            float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1),
            date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True,
            accnt_purge=False),
        Row(acct=10000001239,
            dollar_amt_base=1,
            dollar_amt_compare=1.05,
            dollar_amt_match=False,
            name_base='Lucille Bluth',
            name_compare='Lucille Bluth',
            name_match=True,
            float_fld_base=None,
            float_fld_compare=None,
            float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1),
            date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True,
            accnt_purge=True)
    ])

    assert comparison3.rows_both_all.count() == 5
    assert expected_df.unionAll(
        comparison3.rows_both_all).distinct().count() == 5
Example #18
os.environ["PYSPARK_SUBMIT_ARGS"] = ( "--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11")
sc = SparkContext(master="local[*]", appName="graphFrames-community-detection")
sqlContext = SQLContext(sc)
sc.setLogLevel("ERROR")

inputFile = sys.argv[1]
outputFile = sys.argv[2]

inpData = sc.textFile(inputFile)
rddData = inpData.map(lambda x : x.split(' '))
rdd = rddData.union(inpData.map(lambda x : x.split(' ')[::-1]))\
	.persist()

edgeSchema = StructType([StructField('src', StringType()), StructField('dst',StringType())])
vertexSchema = StructType([StructField('id', StringType())])
graphVertices= rdd.flatMap(lambda v: [Row(v[0]),Row(v[1])])\
	.distinct()
graphEdges = rdd.map(lambda x: (x[0], x[1]))\
	.map(lambda x: Row(src=x[0], dst=x[1]))
verticesDataframe= sqlContext.createDataFrame(graphVertices, vertexSchema)
edgesDataframe = sqlContext.createDataFrame(graphEdges, edgeSchema)

graph = GraphFrame(verticesDataframe, edgesDataframe)
graphList = graph.labelPropagation(maxIter=5)\
	.rdd\
	.map(lambda x: (x[1], x[0]))\
	.groupByKey()\
	.map(lambda x: x[1])\
	.collect()

sortedGraph = list()
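
# A possible continuation (hedged sketch): collect each community's member ids into
# sortedGraph, sorting members and then ordering communities by size and members.
for community in graphList:
    sortedGraph.append(sorted(community))
sortedGraph.sort(key=lambda members: (len(members), members))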
Example #19
def test_rows_both_all_returns_a_dataframe_with_all_rows_in_identical_dataframes(
        spark, comparison2):
    expected_df = spark.createDataFrame([
        Row(acct=10000001234,
            dollar_amt_base=123,
            dollar_amt_compare=123,
            dollar_amt_match=True,
            name_base='George Maharis',
            name_compare='George Maharis',
            name_match=True,
            float_fld_base=14530.1555,
            float_fld_compare=14530.1555,
            float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1),
            date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True),
        Row(acct=10000001235,
            dollar_amt_base=0,
            dollar_amt_compare=0,
            dollar_amt_match=True,
            name_base='Michael Bluth',
            name_compare='Michael Bluth',
            name_match=True,
            float_fld_base=1.0,
            float_fld_compare=1.0,
            float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1),
            date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True),
        Row(acct=10000001236,
            dollar_amt_base=1345,
            dollar_amt_compare=1345,
            dollar_amt_match=True,
            name_base='George Bluth',
            name_compare='George Bluth',
            name_match=True,
            float_fld_base=None,
            float_fld_compare=None,
            float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1),
            date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True),
        Row(acct=10000001237,
            dollar_amt_base=123456,
            dollar_amt_compare=123456,
            dollar_amt_match=True,
            name_base='Bob Loblaw',
            name_compare='Bob Loblaw',
            name_match=True,
            float_fld_base=345.12,
            float_fld_compare=345.12,
            float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1),
            date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True),
        Row(acct=10000001239,
            dollar_amt_base=1,
            dollar_amt_compare=1,
            dollar_amt_match=True,
            name_base='Lucille Bluth',
            name_compare='Lucille Bluth',
            name_match=True,
            float_fld_base=None,
            float_fld_compare=None,
            float_fld_match=True,
            date_fld_base=datetime.date(2017, 1, 1),
            date_fld_compare=datetime.date(2017, 1, 1),
            date_fld_match=True)
    ])

    assert comparison2.rows_both_all.count() == 5
    assert expected_df.unionAll(
        comparison2.rows_both_all).distinct().count() == 5
Example #20
sc.setLogLevel("WARN")

ssc = StreamingContext(sc, 60)
sqlContext = SQLContext(sc)

kafkaStream = KafkaUtils.createStream(ssc, 'data04:2181',
                                      'trump-consumer-group2', {'trump': 1})

dataJson = kafkaStream.map(lambda x: json.loads(x[1]))
messages = dataJson.map(lambda x: (x[
    'text'], datetime.strptime(x['created_at'], '%a %b %d %H:%M:%S %z %Y').
                                   replace(tzinfo=timezone.utc)))
messages_downsecs = messages.map(lambda x: (x[0], x[1] - timedelta(
    seconds=x[1].second, microseconds=x[1].microsecond)))

parts = messages_downsecs.map(lambda x: Row(
    tweet=x[0], sentence=processTweetText(x[0]), created_at=x[1].isoformat()))
#parts.count().map(lambda x:'Tweets in this batch: %s' % x).pprint()

#partsDF = parts.transform(lambda rdd: rdd.toDF().collect())
#partsDF = parts.transform(lambda rdd: rdd.collect())
#partsDF = parts.transform(lambda rdd: rdd.collect())

schema = StructType([StructField('sentence', StringType(), True)])
partsDF = sqlContext.createDataFrame(sc.emptyRDD(), schema)


def RDDsToDF2(rdd, partsDF):
    print(partsDF)
    print(rdd.count())
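

# A possible continuation (hedged sketch): the usual DStream-to-DataFrame pattern --
# convert each micro-batch RDD of Rows to a DataFrame and start the streaming
# context. `show_batch` is an illustrative helper; `parts`, `sqlContext`, and `ssc`
# are defined above.
def show_batch(rdd):
    if not rdd.isEmpty():
        sqlContext.createDataFrame(rdd).show(5)


parts.foreachRDD(show_batch)
ssc.start()
ssc.awaitTermination()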

Example #21
def transformToNumeric(inputStr):
    attList = inputStr.split(",")
    #srcip = float(attList[0])
    #srcport = float(attList[1])
    #dstip = float(attList[2])
    #dstport = float(attList[3])
    #proto = 1.0 if attList[4] == "tcp" else 0.0
    total_fpackets = float(attList[5])
    total_fvolume = float(attList[6])
    total_bpackets = float(attList[7])
    total_bvolume = float(attList[8])
    min_fpktl = float(attList[9])
    mean_fpktl = float(attList[10])
    max_fpktl = float(attList[11])
    std_fpktl = float(attList[12])
    min_bpktl = float(attList[13])
    mean_bpktl = float(attList[14])
    max_bpktl = float(attList[15])
    std_bpktl = float(attList[16])
    min_fiat = float(attList[17])
    mean_fiat = float(attList[18])
    max_fiat = float(attList[19])
    std_fiat = float(attList[20])
    min_biat = float(attList[21])
    mean_biat = float(attList[22])
    max_biat = float(attList[23])
    std_biat = float(attList[24])
    duration = float(attList[25])
    min_active = float(attList[26])
    mean_active = float(attList[27])
    max_active = float(attList[28])
    std_active = float(attList[29])
    min_idle = float(attList[30])
    mean_idle = float(attList[31])
    max_idle = float(attList[32])
    std_idle = float(attList[33])
    sflow_fpackets = float(attList[34])
    sflow_fbytes = float(attList[35])
    sflow_bpackets = float(attList[36])
    sflow_bbytes = float(attList[37])
    fpsh_cnt = float(attList[38])
    bpsh_cnt = float(attList[39])
    furg_cnt = float(attList[40])  # remove
    burg_cnt = float(attList[41])  # remove
    total_fhlen = float(attList[42])
    total_bhlen = float(attList[43])
    dscp = float(attList[44])
    classe = float(attList[45])

    linhas = Row(classe=classe,
                 total_fpackets=total_fpackets,
                 total_fvolume=total_fvolume,
                 total_bpackets=total_bpackets,
                 total_bvolume=total_bvolume,
                 min_fpktl=min_fpktl,
                 mean_fpktl=mean_fpktl,
                 max_fpktl=max_fpktl,
                 std_fpktl=std_fpktl,
                 min_bpktl=min_bpktl,
                 mean_bpktl=mean_bpktl,
                 max_bpktl=max_bpktl,
                 std_bpktl=std_bpktl,
                 min_fiat=min_fiat,
                 mean_fiat=mean_fiat,
                 max_fiat=max_fiat,
                 std_fiat=std_fiat,
                 min_biat=min_biat,
                 mean_biat=mean_biat,
                 max_biat=max_biat,
                 std_biat=std_biat,
                 duration=duration,
                 min_active=min_active,
                 mean_active=mean_active,
                 max_active=max_active,
                 std_active=std_active,
                 min_idle=min_idle,
                 mean_idle=mean_idle,
                 max_idle=max_idle,
                 std_idle=std_idle,
                 sflow_fpackets=sflow_fpackets,
                 sflow_fbytes=sflow_fbytes,
                 sflow_bpackets=sflow_bpackets,
                 sflow_bbytes=sflow_bbytes,
                 fpsh_cnt=fpsh_cnt,
                 bpsh_cnt=bpsh_cnt,
                 furg_cnt=furg_cnt,
                 burg_cnt=burg_cnt,
                 total_fhlen=total_fhlen,
                 total_bhlen=total_bhlen,
                 dscp=dscp)

    return linhas
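
# A minimal usage sketch (hedged), assuming a SparkContext `sc`, an active
# SparkSession for toDF(), and a comma-separated flow-statistics file laid out as
# indexed above (the path is hypothetical):
flow_lines = sc.textFile("network_flows.csv")
flow_df = flow_lines.map(transformToNumeric).toDF()
flow_df.printSchema()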
Example #22
sc = spark.sparkContext

###########################  Q2A  ##############################

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# In[3]:

lines = spark.read.option("header", "true").csv("Data/ratings.csv").rdd

# In[4]:

ratingsRDD = lines.map(lambda p: Row(userId=int(p[0]),
                                     movieId=int(p[1]),
                                     rating=float(p[2]),
                                     timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)

# In[5]:

ratings.show(5)

# In[6]:

(split1, split2, split3) = ratings.randomSplit([0.33, 0.33, 0.34])

# split training data and test data

training1 = split2.union(split3)
test1 = split1
Example #23
def parseInput(line):
    fields = line.split()
    return Row(movieID = int(fields[1]), rating = float(fields[2]))
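
# A minimal usage sketch (hedged), assuming a SparkSession `spark` and a
# MovieLens-style u.data file (userID movieID rating timestamp per line):
lines = spark.sparkContext.textFile("u.data")  # hypothetical path
ratings = lines.map(parseInput).toDF()
ratings.groupBy("movieID").avg("rating").show(5)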
Example #24
 def test_schema(self):
     data = [Row(x=float(x)) for x in range(100)]
     df = self.sql.createDataFrame(data)
     tfs.print_schema(df)
Example #25
            movieNames[int(fields[0])] = fields[1]
    return movieNames


# Create a SparkSession (the config bit is only for Windows!)
spark = SparkSession.builder.config("spark.sql.warehouse.dir").appName(
    "PopularMovies").getOrCreate()

# Load up our movie ID -> name dictionary
nameDict = loadMovieNames()
#
## Get the raw data
lines = spark.sparkContext.textFile(
    "/Users/ameyapatankar/Desktop/SparkCourse/ml-100k/u.data")
## Convert it to a RDD of Row objects
movies = lines.map(lambda x: Row(movieID=int(x.split()[1])))
## Convert that to a DataFrame
movieDataset = spark.createDataFrame(movies)
#
## Some SQL-style magic to sort all movies by popularity in one line!
topMovieIDs = movieDataset.groupBy("movieID").count().orderBy(
    "count", ascending=False).cache()
#
## Show the results at this point:
#
##|movieID|count|
##+-------+-----+
##|     50|  584|
##|    258|  509|
##|    100|  508|
#
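
# A possible final step (hedged sketch): resolve the top movie IDs to names using the
# nameDict loaded above.
for result in topMovieIDs.take(10):
    print("%s: %d" % (nameDict[result[0]], result[1]))

spark.stop()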
Example #26
    .config("spark.executor.memory", "500mb") \
    .appName("Exercise1") \
    .getOrCreate()

products = spark.read.parquet("datasets/products_parquet/")
sales = spark.read.parquet("datasets/sales_parquet/")

REPLICATION_FACTOR = 101

skewed_keys = sales.groupby("product_id").count().orderBy("count", ascending=False).limit(100).collect()

replicated_products = [product["product_id"] for product in skewed_keys]
salted_keys = [(product_id, part_id) for product_id in replicated_products for part_id in range(REPLICATION_FACTOR)]

rdd = spark.sparkContext.parallelize(salted_keys)
replicated_df = rdd.map(lambda x: Row(product_id=x[0], replication=int(x[1])))
replicated_df = spark.createDataFrame(replicated_df)

products = products.join(
        other=broadcast(replicated_df),
        on=products["product_id"] == replicated_df["product_id"],
        how="left") \
    .withColumn(
        colName="salted_join_key",
        col=when(replicated_df["replication"].isNull(), products["product_id"])
            .otherwise(concat(replicated_df["product_id"], lit("-"), replicated_df["replication"]))
    )

sales = sales.withColumn(
    colName="salted_join_key",
    col=when(
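        # hedged sketch of how the sales-side salted key is typically completed (the
        # snippet above ends mid-expression); mirrors the products-side pattern and
        # assumes rand() is imported from pyspark.sql.functions
        sales["product_id"].isin(replicated_products),
        concat(sales["product_id"], lit("-"),
               (rand() * REPLICATION_FACTOR).cast("int"))
    ).otherwise(sales["product_id"]))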
Example #27
    # creating a DataFrame from list of tuples:
    # Create a list of tuples. 
    # Each tuple contains name, city, and age.
    # Create a RDD from the list above.
    # Convert each tuple to a row.
    # Create a DataFrame by applying createDataFrame 
    # on RDD with the help of sqlContext.
    list_of_tuples= [('alex','Sunnyvale', 25), ('mary', 'Cupertino', 22), ('jane', 'Ames', 20), ('bob', 'Stanford', 26)]
    print("list_of_tuples = ", list_of_tuples)
    rdd = spark.sparkContext.parallelize(list_of_tuples)
    print("rdd = ", rdd)
    print("rdd.count() = ", rdd.count())
    print("rdd.collect() = ", rdd.collect())
    
    # convert rdd (as RDD[(String, String, Integer)] into RDD[Row]
    people = rdd.map(lambda x: Row(name=x[0], city=x[1], age=int(x[2])))
    print("people = ", people)
    print("people.count() = ", people.count())
    print("people.collect() = ", people.collect())

    # create a DataFrame as df
    df = spark.createDataFrame(people)
    print("df = ", df)
    print("df.count() = ", df.count())
    print("df.collect() = ", df.collect())
    df.show()
    df.printSchema()
       
    # done!
    spark.stop()
Example #28
def generate_spark_graph(strings, sc, mat=None, min_ld=1, max_ld=1):
    """
    Make a graph using the Spark graphframes library

    Inputs
    ------

    strings: list
        a list of strings to use for the pairwise distance matrix
    sc : pyspark.SparkContext
        a live SparkContext
    mat : pyspark.RDD, optional
        an RDD representing the distance matrix (returned by `distance_matrix`). If not given, 
        it is generated automatically
    min_ld : int, optional
        minimum Levenshtein distance
    max_ld : int, optional
        maximum Levenshtein distance

    Returns
    -------
    g : graphframes.GraphFrame object with strings as node names

    """
    try:
        import findspark
        findspark.init()
        import graphframes
        from pyspark.sql import Row, SQLContext
        from pyspark.sql.types import StructField, StructType, IntegerType, ShortType, StringType, LongType
    except Exception:
        warn(
            'Problem importing pyspark -- are you sure your SPARK_HOME is set?'
        )

    sqc = SQLContext(sc)

    strings_b = sc.broadcast(strings)
    size = len(strings)

    # make the vertex DataFrame
    v_schema = StructType([
        StructField('id', IntegerType()),
        StructField('string', StringType())
    ])
    v_rdd = sc.parallelize(
        range(size)).map(lambda x: Row(id=x, string=strings_b.value[x]))
    v = sqc.createDataFrame(v_rdd, schema=v_schema)

    # make the edge DataFrame
    if mat is None:
        mat = distance_matrix(strings, min_ld=min_ld, max_ld=max_ld, sc=sc)
    e_schema = StructType([
        StructField('src', IntegerType()),
        StructField('dst', IntegerType()),
        StructField('weight', ShortType())
    ])
    e = sqc.createDataFrame(mat, schema=e_schema)
    gf = graphframes.GraphFrame(v, e)

    return gf
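
# A minimal usage sketch (hedged), assuming a live SparkContext `sc` and that the
# graphframes Spark package is on the classpath; the strings list is sample data.
strings = ["kitten", "sitten", "sittin", "bitten"]
g = generate_spark_graph(strings, sc)
g.vertices.show()
g.edges.show()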
Example #29
 def test_sampleby(self):
     df = self.sc.parallelize([Row(a=i, b=(i % 3))
                               for i in range(100)]).toDF()
     sampled = df.stat.sampleBy(u"b", fractions={0: 0.5, 1: 0.5}, seed=0)
     self.assertTrue(sampled.count() == 35)
Example #30
from pyspark.sql import Row
from pyspark.sql.functions import countDistinct
import pandas as pd
import os
import matplotlib.pyplot as plt

spark = SparkSession.builder.master("local").appName("Collaborative Filtering").getOrCreate()

# load csv
rawdf = spark.read.csv("ratings_Sports_and_Outdoors.csv").toDF('userId', 'itemId', 'rating', 'timestamp')

# map userId and itemId to integers
userIdIntMap = rawdf.rdd.map(lambda r: r.userId).distinct().zipWithUniqueId().collectAsMap()
itemIdIntMap = rawdf.rdd.map(lambda r: r.itemId).distinct().zipWithUniqueId().collectAsMap()
rawdf = rawdf.rdd.map(
    lambda d: Row(userId=userIdIntMap.get(d.userId), itemId=itemIdIntMap.get(d.itemId), rating=float(d.rating))).toDF()

# get ratings' count grouped by userId
counts = rawdf.groupBy(rawdf.userId).count()
counts_group = counts.rdd.map(lambda row:(row['count'], 1)).reduceByKey(lambda a, b: a+b).sortByKey(ascending=True).collect()
print(counts_group)
# musical instruments
#(1, 270914), (2, 39277), (3, 13015), (4, 5968), (5, 3226), (6, 1973), (7, 1230), (8, 786), (9, 569), (10, 430), (11, 356), (12, 247), (13, 192), (14, 143), (15, 121), (16, 96), (17, 95), (18, 58), (19, 64), (20, 51), (21, 46), (22, 45), (23, 34), (24, 31), (25, 25), (26, 22), (27, 9), (28, 21), (29, 15), (30, 12), (31, 14), (32, 12), (33, 6), (34, 3), (35, 8), (36, 8), (37, 5), (38, 4), (39, 5), (40, 8), (41, 1), (42, 4), (43, 2), (44, 1), (46, 3), (47, 4), (48, 5), (49, 3), (50, 3), (51, 4), (52, 3), (53, 1), (55, 5), (56, 1), (58, 1), (59, 2), (60, 1), (61, 2), (62, 3), (63, 1), (64, 3), (67, 3), (69, 1), (71, 2), (72, 2), (76, 1), (77, 1), (82, 2), (84, 1), (86, 2), (89, 2), (94, 1), (97, 1), (99, 1), (101, 1), (106, 1), (108, 1), (110, 2), (113, 1), (114, 1), (118, 1), (126, 1), (135, 1), (154, 1), (454, 1), (463, 1), (483, 1)]

# sports
#[(1, 1463787), (2, 288644), (3, 104732), (4, 48990), (5, 26588), (6, 16106), (7, 10328), (8, 7123), (9, 5053), (10, 3678), (11, 2834), (12, 2167), (13, 1711), (14, 1305), (15, 1081), (16, 851), (17, 727), (18, 592), (19, 484), (20, 407), (21, 363), (22, 324), (23, 268), (24, 227), (25, 184), (26, 193), (27, 182), (28, 147), (29, 115), (30, 111), (31, 118), (32, 72), (33, 75), (34, 82), (35, 65), (36, 68), (37, 63), (38, 54), (39, 41), (40, 33), (41, 32), (42, 27), (43, 28), (44, 30), (45, 25), (46, 29), (47, 19), (48, 22), (49, 19), (50, 17), (51, 12), (52, 11), (53, 18), (54, 10), (55, 11), (56, 10), (57, 16), (58, 16), (59, 11), (60, 7), (61, 7), (62, 6), (63, 9), (64, 4), (65, 7), (66, 4), (67, 4), (68, 10), (69, 8), (70, 3), (71, 5), (72, 3), (73, 5), (74, 4), (75, 6), (76, 2), (77, 6), (78, 3), (79, 4), (80, 6), (81, 2), (82, 3), (83, 3), (84, 3), (85, 1), (87, 1), (88, 3), (89, 1), (90, 2), (91, 2), (92, 1), (94, 6), (95, 1), (97, 1), (98, 1), (99, 3), (101, 2), (103, 1), (104, 2), (105, 1), (106, 1), (109, 4), (111, 1), (112, 1), (113, 1), (115, 2), (117, 1), (119, 3), (120, 2), (122, 1), (123, 1), (124, 1), (129, 1), (131, 1), (143, 1), (147, 1), (153, 1), (155, 1), (156, 1), (159, 1), (168, 1), (194, 1), (224, 1), (254, 1), (403, 1)]


# counts_keys = [count[0] for count in counts_group]
# counts_values = [count[1] for count in counts_group]
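
# A possible continuation (hedged sketch): complete the commented-out lines above and
# plot the long-tail distribution with the matplotlib import at the top.
counts_keys = [count[0] for count in counts_group]
counts_values = [count[1] for count in counts_group]
plt.bar(counts_keys, counts_values)
plt.xlabel("ratings per user")
plt.ylabel("number of users")
plt.show()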