def anon_firstlastname_spark(df, blacklist, firstnames, lastnames,
                             *columns_to_anon):
    from pyspark.sql.functions import udf, col
    from pyspark.sql.types import StringType

    sc = df._sc

    blacklist = set(map(str.lower, blacklist))
    firstnames = set(map(str.lower, firstnames))
    lastnames = set(map(str.lower, lastnames))

    blacklist_bc = sc.broadcast(blacklist)
    firstnames_bc = sc.broadcast(firstnames)
    lastnames_bc = sc.broadcast(lastnames)
    firstlastnames_bc = sc.broadcast(firstnames | lastnames)

    anonymizer = AnonFirstLastName(blacklist_bc, firstnames_bc, lastnames_bc,
                                   firstlastnames_bc)
    anonymizer_udf = udf(anonymizer.__call__, StringType())

    result = df
    for colname in columns_to_anon:
        print("Anonymizing column {}".format(colname))
        result = result.withColumn(colname + "_anonymized",
                                   anonymizer_udf(col(colname))).drop(colname)

    return result
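A hypothetical usage sketch (not part of the original example), assuming a DataFrame `df` with free-text columns "notes" and "comments" and that the word lists are supplied by the caller:

# Hypothetical usage: anonymize two free-text columns of an existing DataFrame `df`.
blacklist = ["acme", "initech"]
firstnames = ["alice", "bob"]
lastnames = ["smith", "jones"]

anonymized = anon_firstlastname_spark(df, blacklist, firstnames, lastnames,
                                      "notes", "comments")
anonymized.show(5, truncate=False)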
Example #2
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


def map_tag(momo2tag):
    def mapping(col):
        # Look up the tag for this value; default to an empty string when missing.
        return momo2tag.get(col, '')
    return udf(mapping, StringType())
Example #3
import json
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


def translate(tag):
    def translate_(col):
        # Parse the JSON string and return the value stored under `tag`.
        return json.loads(col).get(tag)
    return udf(translate_, StringType())
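A hypothetical usage sketch (not part of the original example), assuming a DataFrame `events` with a JSON string column "payload":

# Hypothetical usage: pull the "country" field out of a JSON string column.
from pyspark.sql.functions import col

events_with_country = events.withColumn("country", translate("country")(col("payload")))
events_with_country.show(5)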
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType

# instantiate a SparkContext for spark-submit
conf = SparkConf().setAppName("transforming effective care and readmissions")
sc = SparkContext(conf=conf)
# read the Hive tables
hive_context = HiveContext(sc)
measure_effective_care = hive_context.table("default.measure_effective_care")
# filter
measure_effective_care.registerTempTable("effective_temp")
measure_effective_selected = hive_context.sql(
    "SELECT provider_id, measure_id, score, sample, measure_start, measure_end FROM effective_temp"
)
# add a column to flag the data as effective care
measure_effective_with_care_type = measure_effective_selected.withColumn(
    'care_type',
    lit("effective").cast(StringType()))
# add an empty column for readmission denominators
measure_effective_with_care_type_denominator = measure_effective_with_care_type.withColumn(
    'denominator',
    lit(None).cast(StringType()))

# read in readmission data
measure_readmission = hive_context.table("default.measure_readmission")
measure_readmission.registerTempTable("readmission_temp")
measure_readmission_selected = hive_context.sql(
    "SELECT provider_id, measure_id, denominator, score, measure_start, measure_end FROM readmission_temp"
)
# prepare readmissions for a union with effective measures: add empty column for sample
measure_readmission_with_sample = measure_readmission_selected.withColumn(
    'sample',
    lit(None).cast(StringType()))
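
The excerpt stops before the two DataFrames are actually combined. A minimal sketch of how the union could proceed (not part of the original example), assuming the readmission rows should be flagged analogously to the effective-care rows:

# Hypothetical continuation: flag readmission rows, align the column order,
# and union the two measure DataFrames.
measure_readmission_with_care_type = measure_readmission_with_sample.withColumn(
    'care_type',
    lit("readmission").cast(StringType()))

common_columns = ['provider_id', 'measure_id', 'score', 'sample', 'denominator',
                  'measure_start', 'measure_end', 'care_type']
all_measures = (
    measure_effective_with_care_type_denominator.select(common_columns)
    .unionAll(measure_readmission_with_care_type.select(common_columns))
)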
Example #5
import spark_functions as sf
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

if __name__ == '__main__':

    table_to_read = 'unified_minute_view'
    table_to_write = 'unified_hour'

    sf.initialize_environment()

    new_sql_context = sf.create_sql_context('spark_minute_to_hour')
    udf_aggregator = udf(sf.minute_aggregator, StringType())

    hour_table = sf.spark_unified_aggregator(
        udf_aggregator, sf.load_and_get_table_df(new_sql_context,
                                                 table_to_read))
    sf.write_df_to_table(hour_table, table_to_write)
   "via","while","posrr","scorr","thankyou","across","alone","another","becoming","bottom","due","ever","formerly","hereafter",
   "is","moreover","of","please","so","then","thus","w","whither","posrt","scodi","along","any","been","but","during","every",
   "forty","hereby","it","most","often","put","some","thence","tl""was","who","posdi","scodu","after","already","anyhow","before",
   "by","each","everyone","fosm","herein","its","mostly","once","rather","somehow","there","tm","we","whoever","posdu","scord","afterwards",
   "also","anyone","beforehand"])
   cleaned = ' '.join([w for w in sentence.split() if w not in stops])
   cleaned = ' '.join([w for w in cleaned.split() if len(w) >= 2])
   if len(cleaned) <= 1:
      return "NA"
   else:
      return cleaned



org_val = udf(cleaning_text, StringType())
data = data.withColumn("cleaned", org_val(data.summary))
data = data.filter(data["cleaned"] != "NA")

tokenizer = Tokenizer(inputCol="cleaned", outputCol="words")
wordsData = tokenizer.transform(data)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# assembler=VectorAssembler(inputCols=['op1','op2','op3','pr1','features'],outputCol='FeaturesFinal')
# assembled=assembler.transform(rescaledData)

ADS=rescaledData.select('label','features')
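
A hypothetical next step (not part of the original example): the commented-out VectorAssembler suggests the features were meant to feed a model, so here is a sketch of fitting a simple classifier on ADS, assuming `label` is a numeric class column.

# Hypothetical continuation: fit a logistic regression on the TF-IDF features.
from pyspark.ml.classification import LogisticRegression

train, test = ADS.randomSplit([0.8, 0.2], seed=42)
lr = LogisticRegression(featuresCol='features', labelCol='label')
model = lr.fit(train)
predictions = model.transform(test)
predictions.select('label', 'prediction').show(10)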
Example #7
def transform(df):
    """ Transform dataframe to an acceptable form.

    Args:
        df: raw dataframe

    Returns:
        df: processed dataframe
    """
    # Remove the user field
    df_flatten_one_level = df.select('message.*', 'kafka_consume_ts')
    df_flatten_one_level = df_flatten_one_level.drop('user')

    # If retweeted_status and quoted_status are present in the JSON objects, pull them
    # out as new rows and drop them from the original records.
    df_retweeted_status = df_flatten_one_level \
        .where(df_flatten_one_level.retweeted_status.isNotNull()) \
        .select('retweeted_status.*') \
        .drop('user') \
        .withColumn('keyword', lit(None).cast(StringType())) \
        .withColumn('kafka_consume_ts', lit(None).cast(StringType()))

    df_quoted_status = df_flatten_one_level \
        .where(df_flatten_one_level.quoted_status.isNotNull()) \
        .select('quoted_status.*') \
        .drop('user') \
        .withColumn('keyword', lit(None).cast(StringType())) \
        .withColumn('kafka_consume_ts', lit(None).cast(StringType()))

    df_flatten_one_level = df_flatten_one_level.drop("retweeted_status",
                                                     "quoted_status")

    common_unnested_columns = [
        'created_at', 'favorite_count', 'favorited', 'id', 'id_str',
        'in_reply_to_screen_name', 'in_reply_to_status_id',
        'in_reply_to_status_id_str', 'in_reply_to_user_id',
        'in_reply_to_user_id_str', 'is_quote_status', 'keyword', 'lang',
        'possibly_sensitive', 'quoted_status_id', 'quoted_status_id_str',
        'retweet_count', 'retweeted', 'source', 'text', 'truncated',
        'kafka_consume_ts'
    ]

    df_flatten_one_level = df_flatten_one_level.select(common_unnested_columns) \
        .unionAll(df_retweeted_status.select(common_unnested_columns)) \
        .unionAll(df_quoted_status.select(common_unnested_columns))

    # Remove duplicate tweets.
    df_without_duplicate = df_flatten_one_level.dropDuplicates(subset=['id'])

    # Remove space characters from text fields.
    for item in df_without_duplicate.dtypes:
        if item[1].startswith('string'):
            df_without_duplicate = df_without_duplicate.withColumn(
                item[0], trim(df_without_duplicate[item[0]]))

    # Convert created_at field to DateTime with (year-month-day) format.
    split_col = split(df_without_duplicate['created_at'], ' ')
    df_with_date = df_without_duplicate.withColumn(
        'created_at_date',
        concat_ws('-', split_col.getItem(5), split_col.getItem(1),
                  split_col.getItem(2)))
    df_final = df_with_date.withColumn(
        "created_at_date",
        from_unixtime(
            unix_timestamp(df_with_date.created_at_date, 'yyyy-MMM-dd'),
            'yyyy-MM-dd'))

    return df_final
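A hypothetical invocation (not part of the original example), assuming `raw_df` already holds the parsed tweet struct in a `message` column alongside `kafka_consume_ts`:

# Hypothetical usage: apply the transformation and inspect a few cleaned tweets.
clean_df = transform(raw_df)
clean_df.select('id', 'text', 'created_at_date').show(5, truncate=False)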
Example #8
def translate():
    def translate_(col):
        # hashlib.md5 expects bytes, so encode the incoming string first.
        hash_md5 = hashlib.md5(col.encode('utf-8'))
        return hash_md5.hexdigest()
    return udf(translate_, StringType())
Example #9
    def __init__(self, sc, data_path):
        self.sc = sc
        self.data_path = data_path
        #sc = SparkContext.getOrCreate()
        self.sqlContext = SQLContext(sc)
        # include the S3 repository here

        bigT = sc.textFile(data_path, 2)
        bigTT = bigT.map(logs2)
        rows = bigTT.map(lambda x: Row(visitorID=x[0],
                                       url=x[1],
                                       action=x[2],
                                       pais=x[3],
                                       provincia=x[4],
                                       time=x[5]))
        dailyMaster = self.sqlContext.createDataFrame(rows)
        ndf = dailyMaster.withColumn('_1',
                                     dailyMaster['time'].cast(DateType()))
        ndf2 = ndf.withColumn('_1', dailyMaster['time'].cast(DateType()))

        def url2(x):
            try:
                a = x.split('//')[1]

            except:

                a = "0"

            return a

        udf2 = udf(lambda x: url2(x), StringType())

        def hosta(x):
            try:
                a = x.split('/')[0]

            except:

                a = "0"

            return a

        udf3 = udf(lambda x: hosta(x), StringType())

        def path(x, n):
            try:
                a = x.split('/')[n]

            except:

                a = "0"

            return a

        # recover each component of the url in turn
        udf4 = udf(lambda x: path(x, 1), StringType())
        udf5 = udf(lambda x: path(x, 2), StringType())
        udf6 = udf(lambda x: path(x, 3), StringType())
        udf7 = udf(lambda x: path(x, 4), StringType())

        ndf_url = ndf2.withColumn('urlClean', udf2(ndf2.url))
        ndf_host = ndf_url.withColumn('host', udf3(ndf_url.urlClean))
        ndf_path1 = ndf_host.withColumn('path1', udf4(ndf_host.urlClean))
        self.ndf5 = ndf_path1.withColumn('path2', udf5(ndf_path1.urlClean))
Example #10
import spark_functions as sf
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType


if __name__ == '__main__':

    table_to_write = 'unified_minute'
    value_name = 'follower_count'
    value_alias = 'follower_count'

    sf.initialize_environment()

    new_sql_context = sf.create_sql_context('spark_live_to_minute')
    udf_aggregator = udf(sf.second_aggregator, StringType())

    # Get live tables and aggregate into minutes

    youtube_minute = sf.spark_live_aggregator(udf_aggregator, sf.load_and_get_table_df(new_sql_context, 'youtube_live_view'),
                                              value_name, value_alias)
    twitch_minute = sf.spark_live_aggregator(udf_aggregator, sf.load_and_get_table_df(new_sql_context, 'twitch_live_view'),
                                             value_name, value_alias)
    twitter_minute = sf.spark_live_aggregator(udf_aggregator, sf.load_and_get_table_df(new_sql_context, 'twitter_live_view'),
                                              value_name, value_alias)

    joined_minute_tables = sf.join_3_tables_by_streamer_and_timestamp(youtube_minute, twitch_minute, twitter_minute)
    sf.write_df_to_table(joined_minute_tables
                         .withColumn('total_count', col('youtube_count') + col('twitch_count') + col('twitter_count')),
                         table_to_write)
# Run the function over the entire data set
all_genres = genre_counts.collect()
movies_add_genres = ratings_with_titles

for genre in all_genres:
    movies_add_genres = process_genres(movies_add_genres, genre[0])

print("--Genres function applied to all movies--")
movies_add_genres.show(20, False)


# Extract the year from the movie title
def extract_year(title_and_year):
    if "(" in str(title_and_year):
        return str(title_and_year).split("(")[-1].replace(")", "")
    else:
        return None


from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

year_udf = udf(extract_year, StringType())

movies_with_year = movies_add_genres.withColumn(
    "year", year_udf(movies_add_genres["title"]))
print("--Movie years--")
movies_with_year.show(20, False)

movies_pandas = movies_with_year.toPandas()
movies_pandas.to_csv("movies_pandas.csv")
Example #12
import spark_functions as sf
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

if __name__ == '__main__':

    table_to_read = 'unified_hour_view'
    table_to_write = 'unified_day'

    sf.initialize_environment()

    new_sql_context = sf.create_sql_context('spark_hour_to_day')
    new_sql_context.addPyFile('./spark_functions.py')
    udf_aggregator = udf(sf.hour_aggregator, StringType())

    hour_table = sf.spark_unified_aggregator(
        udf_aggregator, sf.load_and_get_table_df(new_sql_context,
                                                 table_to_read))
    sf.write_df_to_table(hour_table, table_to_write)
       "%s (%d incidents)" % (mostOccurrencesPremise["PREMISE TYPE"], mostOccurrencesPremise["Total"]))))

showCrime = CrimeOccurrencesDF.orderBy(col("Total").desc()).show()


# ### 4.3 Location Columns Analysis

# In[7]:


## Address Columns Analysis:
from IPython.display import display, Markdown
from pyspark.sql.types import StringType

## Cast the ZIP column to a string and check the resulting schema.
crime_DF.withColumn("ZIP", col("ZIP").cast(StringType())).printSchema()

print("Checking for nulls on columns 100 BLOCK ADDR, ZIP:")
crime_DF.select([count(when(col(c).isNull(), c)).alias(c) for c in ["ZIP","100 BLOCK ADDR"]]).show()

print("Checking amount of distinct values in columns 100 BLOCK ADDR, ZIP:")
crime_DF.select([countDistinct(c).alias(c) for c in ["ZIP","100 BLOCK ADDR"]]).show()

print ("Most and least frequent occurrences for ZIP and 100 BLOCK ADDR:")
ZIPOccurrencesDF = crime_DF.groupBy("ZIP").agg(count(lit(1)).alias("Total"))
ADDRDF = crime_DF.groupBy("100 BLOCK ADDR").agg(count(lit(1)).alias("Total"))

print("ZIP codes with the LEAST amount of crimes:")
leastOccurrencesZIP    = ZIPOccurrencesDF.orderBy(col("Total").asc()).show(10)
print("ZIP codes with the MOST amount of crimes:")
mostOccurrencesZIP     = ZIPOccurrencesDF.orderBy(col("Total").desc()).show(10)
Example #14
data = spark.read.format('com.databricks.spark.csv').option(
    'header', 'true').load('data0201.csv')
from pyspark.sql.functions import udf, lit, concat_ws
from pyspark.sql.types import StringType


def zerofill(x):
    return x.zfill(2)


strzfill = udf(zerofill, StringType())

data_0 = data.withColumn('month',strzfill(data['month']))\
.withColumn('day',strzfill(data['day']))\
.withColumn('hour',strzfill(data['hour']))\
.withColumn('year',lit('2019'))

# merge the data
from pyspark.sql.functions import to_timestamp
data_1 = data_0.withColumn(
    'dptime0', concat_ws('-', data_0['year'], data_0['month'], data_0['day']))
data_2 = data_1.withColumn('dptime1',
                           concat_ws(' ', data_1['dptime0'], data_1['hour']))
data_2 = data_2.withColumn('tail', lit('00:00'))
data_3 = data_2.withColumn('dptime2',
                           concat_ws(':', data_2['hour'], data_2['tail']))
data_use = data_3.withColumn('dptime',to_timestamp(data_3['dptime2']))\
.drop('dptime0','dptime1', 'dptime2')

# link the passenger data
data_use.createOrReplaceTempView('datas')
path_distc = spark.sql(
Example #15
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


def map_tag(app2tag):
    def mapping(col):
        # Look up the tag for this app; returns None when the app is unknown.
        return app2tag.get(col)

    return udf(mapping, StringType())
Example #16
        "County": County,
        "Address": Address,
        "GPSLatitude": GPSLatitude,
        "GPSLongitude": GPSLongitude,
        "PlaceID": PlaceID,
        "MapURL": MapURL,
        "PartialMatch": PartialMatch,
        "LocationType": LocationType,
        "FormattedAddress": FormattedAddress,
        "ErrorMessage": ErrorMessage,
        "jsonResults": jsonResults
    }
    return json.dumps(returnJson)


geocodeUDF = udf(lambda z: geocodeRequest(z), StringType())


def truncLatLng(latLng):
    if latLng is None:
        return latLng
    else:
        numInt = int(latLng)
        numLen = len(str(numInt))
        if latLng >= 0:
            res = round(latLng, 6 - numLen)
        else:
            res = round(latLng, 7 - numLen)
        return str(res).rstrip('0').rstrip('.')

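A hypothetical usage sketch (not part of the original example), assuming a DataFrame `locations_df` with a string column "address" and double columns "lat" and "lng":

# Hypothetical usage: geocode the address column and shorten the coordinate columns.
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

truncLatLngUDF = udf(truncLatLng, StringType())

geocoded_df = (locations_df
               .withColumn("geocode_json", geocodeUDF(col("address")))
               .withColumn("lat_short", truncLatLngUDF(col("lat")))
               .withColumn("lng_short", truncLatLngUDF(col("lng"))))
geocoded_df.show(5, truncate=False)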
Example #17
    data_dir = sc.wholeTextFiles(",".join(li))
    data_dir = data_dir.toDF()
    data_dir = data_dir.dropDuplicates()
    if index == 0:
        data = data_dir
    else:
        data = data.union(data_dir)

data = spark.read.format('parquet').options(
    headers=True).load('Downloads/enron_correct')
data = data.withColumnRenamed("_1", "location")
data = data.withColumnRenamed("_2", "message")
data = data.withColumn("user", split(col("location"), "/").getItem(5))
data = data.withColumn("message", regexp_replace("message", "N", "_n"))

ToEmailParses = udf(lambda z: ToEmailParse(z), StringType())
spark.udf.register("ToEmailParse", ToEmailParses)


def ToEmailParse(s):
    msg = email.message_from_string(r'{}'.format(s))
    return msg['to']


data = data.withColumn('to', ToEmailParses('message'))

FromEmailParses = udf(lambda z: FromEmailParse(z), StringType())
spark.udf.register("FromEmailParse", FromEmailParses)


def FromEmailParse(s):