Example #1
    def occCalc(self, channelID, testing=False):
        """ Calculates occupancy for the user defined month
		"""
        if type(channelID) != list:
            raise TypeError('ChannelID is required to be a list')

        conf = SparkConf()\
          .setAppName("Occupancy Calc")\
          .set("spark.master", "local[*]")\
          .set("spark.driver.maxResultSize", "15G")
        sc = SparkContext(conf=conf)
        sql = SQLContext(sc)
        path = 'AZURE PATH' + self.month +\
          '/*/*/' + self.sensor + '*'
        data = sql.read.parquet(path)

        timeCount = data.select('scan_time').distinct().count()
        timeCount = sc.broadcast(timeCount)
        subData = data.select('scan_time', 'channel_id', 'power_dbm').filter(
            data.channel_id.isin(channelID))
        subData = subData.groupBy('channel_id').agg(
            (count(column('power_dbm')) / timeCount.value).alias('freq'),
            stddev(column('power_dbm')).alias('sd')).sort(
                asc('freq'), desc('sd'))

        if testing:
            subData.toPandas().to_csv('C:/path/freq.csv', sep='\t')
            sc.stop()
        else:
            sc.stop()
            return (subData.toPandas())
Example #2
    def identify(log):
        """Identify

        Append next action and timespan to each record and remove timeout.

        Arguments:
            log:    Spark dataframe containing the original log

        Returns:
            Spark dataframe of the log with next action and timespan appended
        """
        # window lag
        win = Window.partitionBy(LogFile.usrid).orderBy(
            column(LogFile.stamp).desc()
        )

        log = log.withColumn("next_", lag(LogFile.event).over(win))
        log = log.withColumn("next_stamp", lag(LogFile.stamp).over(win))
        log = log.na.drop()

        # timespan
        TIME_FMT = "yyyy-MM-dd HH:mm:ss"
        TIME_DIF = unix_timestamp(
            "next_stamp", format=TIME_FMT
        ) - unix_timestamp(LogFile.stamp, format=TIME_FMT)
        log = log.withColumn("restm", TIME_DIF)

        # time out
        log = log.filter(column("restm") <= 1800)

        # formatting
        return log.select(
            LogFile.stamp, LogFile.usrid, LogFile.event, "next_", "restm"
        )
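The identify step above resolves its column names through a LogFile descriptor that is not part of this snippet. The sketch below is a hypothetical stand-in for it, assuming the log simply carries stamp, usrid, and event columns.

# Hypothetical stand-in for the LogFile descriptor used above; the real class
# is defined elsewhere in the project and may differ.
class LogFile:
    stamp = "stamp"    # timestamp of the record
    usrid = "usrid"    # user identifier
    event = "event"    # action performed

# log = spark.read.parquet(...)    # a DataFrame with those three columns
# log = identify(log)              # appends next_ and restm, drops timeouts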
Example #3
def main(args):
    spark = sql.SparkSession.builder.appName('update-mutator').getOrCreate()

    msg_struct = types.StructType([
        types.StructField('text', types.StringType(), True),
        types.StructField('user_id', types.StringType(), True),
        types.StructField('update_id', types.StringType(), True)
    ])

    sentiments_struct = types.ArrayType(
        types.MapType(types.StringType(), types.FloatType(), False))

    analyzer = vader.SentimentIntensityAnalyzer()
    analyzer_bcast = spark.sparkContext.broadcast(analyzer)

    def sentiment_generator_impl(text):
        va = analyzer_bcast.value
        english = SpacyMagic.get('en_core_web_sm')
        result = english(text)
        sents = [str(sent) for sent in result.sents]
        sentiment = [va.polarity_scores(str(s)) for s in sents]
        return sentiment

    sentiment_generator = functions.udf(sentiment_generator_impl,
                                        sentiments_struct)

    def json_converter_impl(user_id, update_id, text, sentiments):
        obj = dict(user_id=user_id,
                   update_id=update_id,
                   text=text,
                   sentiments=sentiments)
        return json.dumps(obj)

    json_converter = functions.udf(json_converter_impl, types.StringType())

    records = (
        spark.readStream
        .format('kafka')
        .option('kafka.bootstrap.servers', args.brokers)
        .option('subscribe', args.intopic)
        .load()
        .select(functions.column('value').cast(types.StringType()).alias('value'))
        .select(functions.from_json(functions.column('value'),
                                    msg_struct).alias('json'))
        .select(functions.column('json.user_id'),
                functions.column('json.update_id'),
                functions.column('json.text'),
                sentiment_generator(functions.column('json.text')).alias('sentiments'))
        .select(json_converter(functions.column('user_id'),
                               functions.column('update_id'),
                               functions.column('text'),
                               functions.column('sentiments')).alias('value'))
        .writeStream
        .format('kafka')
        .option('kafka.bootstrap.servers', args.brokers)
        .option('topic', args.outtopic)
        .option('checkpointLocation', '/tmp')
        .start())

    records.awaitTermination()
Example #4
def main(args):
    """Configure and start the Kafka stream processor"""
    # acquire a SparkSession object
    spark = (
        sql.SparkSession.builder.appName('kafka-spark-python').getOrCreate())

    # if a user function is specified, download it and import it
    user_function = None
    if args.userfunction is not None:
        try:
            logging.info('downloading user function')
            logging.info(args.userfunction)
            dl = urllib.urlretrieve(args.userfunction)[0]
            loader = importlib.SourceFileLoader('userfunction', dl)
            userfunction = pytypes.ModuleType(loader.name)
            loader.exec_module(userfunction)
            user_function = functions.udf(userfunction.user_defined_function,
                                          types.StringType())
            logging.info('user function loaded')
        except Exception as e:
            logging.error('failed to import user function file')
            logging.error(e)
            user_function = None

    # configure the operations to read the input topic
    records = (
        spark.readStream.format('kafka').option(
            'kafka.bootstrap.servers',
            args.brokers).option('subscribe', args.intopic).load().select(
                functions.column('value').cast(
                    types.StringType()).alias('value'))
        # add your data operations here, the raw message is passed along as
        # the alias `value`.
        #
        # for example, to process the message as json and create the
        # corresponding objects you could do the following:
        #
        # .select(
        #     functions.from_json(
        #         functions.column('value'), msg_struct).alias('json'))
        #
        # the following operations would then access the object and its
        # properties using the name `json`.
    )

    # if it exists, add the user function to the stream pipeline
    if user_function is not None:
        records = (records.select(
            user_function(functions.column('value')).alias('value')).where(
                'value is not null'))

    # configure the output stream
    writer = (records.writeStream.format('kafka').option(
        'kafka.bootstrap.servers',
        args.brokers).option('topic',
                             args.outtopic).option('checkpointLocation',
                                                   '/tmp').start())

    # begin processing the input and output topics
    writer.awaitTermination()
Example #5
    def _load_data(self):
        database = SparkDatabase()
        log = database.load("log")

        user = log.groupby("usrid").count().filter(column("count") < 100)
        log = log.join(user, "usrid", "leftanti")

        return log.filter(column("event").isin(self.events))
Example #6
def main():
    df = spark.read.text('/var/tmp/coursera-data/final/en_US/en_US.news.txt')
    # df = spark.createDataFrame(range(int(1e6)), IntegerType())
    data_out_scala = (df.withColumn('sentences', do_split('value')).select(
        explode(column('sentences')).alias('sentence')).withColumn(
            'tokens', do_tokenise('sentence')).select(explode(
                column('tokens'))).groupBy('col').count().orderBy(
                    'count', ascending=False))
    data_out_scala.show()
Example #7
    def collect_data(self):
        """Collect user behavior dataframe"""
        log = self._load_data()
        agg = log.groupby("usrid", "event").agg(avg("restm").alias("avgrt"))

        data = []
        for i, e in enumerate(self.events):
            t = agg.filter(column("event") == e)
            t = t.withColumn("avgrt", round("avgrt", 2))
            data.append(
                t.select(column("usrid"),
                         column("avgrt").alias(str(i))))
        return reduce(lambda x, y: x.join(y, ["usrid"], how="full"), data)
Example #8
    def load(self):
        csv = spark.read.csv(self.path, header=True)

        # merge date time
        datetime = concat(column("date"), lit(" "), column("time"))
        tmp = csv.withColumn("datetime", datetime.cast(TimestampType()))

        # format column name
        log = tmp.select(
            column("datetime").alias(self.stamp),
            column("ip").alias(self.usrid),
            column("extention").alias(self.event),
        )

        return log
Example #9
 def test_using_select_expr(self):
     df = self.df.select(expr("DEST_COUNTRY_NAME"),
                         col("DEST_COUNTRY_NAME"),
                         column("DEST_COUNTRY_NAME"))
     self.assertListEqual(
         df.columns,
         ['DEST_COUNTRY_NAME', 'DEST_COUNTRY_NAME', 'DEST_COUNTRY_NAME'])
Example #10
def convert_column(df, col_name, col_type_str):
    new_col = col_name + "_tmp"
    df = (df.withColumn(new_col, F.column(col_name).cast(col_type_str))
          .drop(col_name)
          .withColumnRenamed(new_col, col_name))
    return df
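A minimal usage sketch for convert_column, assuming an active SparkSession named spark and pyspark.sql.functions imported as F (neither is shown in the snippet above):

# Hypothetical demo data: cast the string column "id" to an integer column.
demo = spark.createDataFrame([("1", "a"), ("2", "b")], ["id", "label"])
demo = convert_column(demo, "id", "int")
demo.printSchema()  # "id" is now of integer type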
Example #11
def main(args):
    spark = sql.SparkSession.builder.appName('update-analyzer').getOrCreate()

    msg_struct = types.StructType([
        types.StructField('text', types.StringType(), True),
        types.StructField('user_id', types.StringType(), True),
        types.StructField('update_id', types.StringType(), True)
    ])

    analyzer = vader.SentimentIntensityAnalyzer()
    analyzer_bcast = spark.sparkContext.broadcast(analyzer)
    vhost_bcast = args.vhost
    vport_bcast = args.vport

    def sentiment_generator_impl(text, user_id, update_id):
        va = analyzer_bcast.value
        english = SpacyMagic.get('en_core_web_sm')
        result = english(text)
        sents = [str(sent) for sent in result.sents]
        sentiments = [va.polarity_scores(str(s)) for s in sents]
        obj = dict(user_id=user_id,
                   update_id=update_id,
                   text=text,
                   sentiments=sentiments)
        try:
            con = httplib.HTTPConnection(host=vhost_bcast, port=vport_bcast)
            con.request('POST', '/', body=json.dumps(obj))
            con.close()
        except Exception as e:
            logging.warning('unable to POST to visualizer, error:')
            logging.warning(e)

    sentiment_generator = functions.udf(sentiment_generator_impl,
                                        types.NullType())

    records = (
        spark.readStream
        .format('kafka')
        .option('kafka.bootstrap.servers', args.brokers)
        .option('subscribe', args.topic)
        .load()
        .select(functions.column('value').cast(types.StringType()).alias('value'))
        .select(functions.from_json(functions.column('value'),
                                    msg_struct).alias('json'))
        .select(functions.column('json.user_id'),
                functions.column('json.update_id'),
                functions.column('json.text'),
                sentiment_generator(functions.column('json.text'),
                                    functions.column('json.user_id'),
                                    functions.column('json.update_id')))
        .writeStream
        .format("console")
        .start())

    records.awaitTermination()
Example #12
 def getitem(self, file_path):
     # read from json file, explode the nested hierarchy, and form a DataFrame
     js_task = ReadJsonFile(file_path)
     uiid = file_path.split('/')[1].split('.')[0]
     js_df = js_task.load()
     new_rdd = js_df.rdd.map(lambda row: row.asDict(True))
     d = new_rdd.collect()[0]
     elements = []
     self.iterdict(d, elements)
     # deal with empty layouts
     try:
         element_df = spark.createDataFrame(Row(**x) for x in elements)
         clean_df = element_df.select(
             column('bounds'),
             column('componentLabel').alias('componentlabel'))
         self.df = clean_df.withColumn('uiid', lit(uiid))
     except Exception:
         self.df = None
     return self.df
Example #13
def train_val_test_split(spark, dirname, random_seed):

    # Read in downsampled interactions
    interactions = spark.read.parquet(
        f'{dirname}/{dirname}_subsamples.parquet')

    interactions = interactions.filter(interactions.rating > 0).drop(
        'is_read', 'is_reviewed')

    # Find all unique user ids with interactions number > 10
    userids = interactions \
        .groupby('user_id') \
        .count().alias('count') \
        .filter(column('count') > 10) \
        .select('user_id')

    # Sample train, val, test user
    train_user = userids.sample(False, TRAIN_FRAC, random_seed)
    remaining_user = userids.subtract(train_user)
    val_user = remaining_user.sample(False, VAL_FRAC / (1 - TRAIN_FRAC),
                                     random_seed)
    test_user = remaining_user.subtract(val_user)

    # Construct train, val, test interactions
    train_all = train_user.join(interactions, on='user_id', how='inner')
    val_all = val_user.join(interactions, on='user_id', how='inner')
    test_all = test_user.join(interactions, on='user_id', how='inner')

    # Split val and test in half
    window = Window.partitionBy('user_id').orderBy('book_id')

    val_interactions = (val_all.select(
        "user_id", "book_id", "rating",
        row_number().over(window).alias("row_number")))
    test_interactions = (test_all.select(
        "user_id", "book_id", "rating",
        row_number().over(window).alias("row_number")))

    val_add2train = val_interactions.filter(val_interactions.row_number %
                                            2 == 0).drop('row_number')
    val = val_interactions.filter(val_interactions.row_number %
                                  2 == 1).drop('row_number')

    test_add2train = test_interactions.filter(test_interactions.row_number %
                                              2 == 1).drop('row_number')
    test = test_interactions.filter(test_interactions.row_number %
                                    2 == 0).drop('row_number')

    # Add half val and half test back to train
    train = train_all.union(val_add2train).union(test_add2train)

    # Write train_set, val_set, test_set out
    train.write.mode('overwrite').parquet(f'{dirname}/train.parquet')
    val.write.mode('overwrite').parquet(f'{dirname}/val.parquet')
    test.write.mode('overwrite').parquet(f'{dirname}/test.parquet')
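train_val_test_split relies on module-level TRAIN_FRAC and VAL_FRAC constants that are not part of the snippet; the values and directory name below are assumptions, shown only to make the call shape concrete.

TRAIN_FRAC = 0.6  # assumed fraction of users kept for training
VAL_FRAC = 0.2    # assumed fraction of users kept for validation

# train_val_test_split(spark, 'goodreads_interactions', random_seed=42)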
Example #14
def basic_rec_val(spark, dirname, rank, regParam, k, random_seed):

    val_set = spark.read.parquet(f'{dirname}/val.parquet')

    print(
        f'Validating on model with rank = {rank} and regParam = {regParam} trained using {dirname} data ...'
    )

    # load corresponding trained model
    model = ALSModel.load(f'{dirname}/{rank}_{regParam}_model')

    # computing RMSE on validation set
    predictions = model.transform(val_set)
    evaluator = RegressionEvaluator(metricName='rmse',
                                    labelCol='rating',
                                    predictionCol='prediction')
    rmse = evaluator.evaluate(predictions)

    print(f'rmse: {rmse}')

    print(f'Constructing top {k} books recommended to per user ...')
    val_users = val_set.select('user_id').distinct()

    start_time = time.time()

    perUserPredictedTopKItemsDF = model.recommendForUserSubset(val_users, k)

    myudf = udf(extract_item, ArrayType(IntegerType()))
    perUserPredictedTopKItemsDF = perUserPredictedTopKItemsDF.withColumn(
        'predictions',
        myudf(perUserPredictedTopKItemsDF['recommendations'])).drop(
            'recommendations')

    print('Constructing actual books per user ...')
    perUserActualItemsDF = val_set.filter(
        column('rating') >= 3.0).groupBy('user_id').agg(
            expr('collect_list(book_id) as book_ids'))

    print('Constructing Ranking Metrics ...')
    perUserItemsRDD = perUserPredictedTopKItemsDF.join(
        perUserActualItemsDF, 'user_id').rdd.map(lambda row: (row[1], row[2]))

    rankingMetrics = RankingMetrics(perUserItemsRDD)

    precisionAtK = rankingMetrics.precisionAt(k)
    mAP = rankingMetrics.meanAveragePrecision

    end_time = time.time()
    time_delta = str(datetime.timedelta(seconds=end_time - start_time))

    print(f'p@{k}: {precisionAtK}')
    print(f'mAP: {mAP}')
    print(f'run time: {time_delta}')
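basic_rec_val references an extract_item helper that is not shown above; a plausible sketch, assuming each ALS recommendation struct exposes a book_id field:

def extract_item(recommendations):
    # Pull the item ids out of the recommendation structs (assumption: the
    # struct field is named book_id, matching the training data).
    return [row.book_id for row in recommendations]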
Example #15
    def view(self, data, pred):
        """Use PCA to reduce dimension and visualize the data"""
        pca = PCA(k=3, inputCol="scaled", outputCol="pca-3")
        model = pca.fit(data)
        transformed = model.transform(data)
        view = (transformed.select("prediction", "pca-3").withColumn(
            "axis", self.to_array(
                column("pca-3"))).select(["prediction"] +
                                         [column("axis")[i]
                                          for i in range(3)]))

        dataframe = view.toPandas()
        fig = pyplot.figure(figsize=(20, 20))
        ax = fig.add_subplot(111, projection="3d")
        ax.scatter(
            dataframe.iloc[:, 1],
            dataframe.iloc[:, 2],
            dataframe.iloc[:, 3],
            c=dataframe.iloc[:, 0] + 2,
        )
        pyplot.show()
Example #16
def main(Number, Query, File, write=False):
    print("\nSetting up the enviroment\n")
    conf = SparkConf().setAppName("Searcher")  # configure Spark
    sc = SparkContext(
        conf=conf)  # start Spark Context with the specific configuration
    sql = SQLContext(sc)  # start Spark SQL

    print("\nLoading the data\n")
    data = sql.read.load(File)

    print("\nQuerying the keywords in the database\n")
    totKeyword = len(Query)
    filtered = data.filter(
        column('word').isin([word.lower() for word in Query
                             ]))  # Query the database based on request

    sumed = filtered.groupby(filtered.location).agg(
        sum('tfIdf').alias("tot"))  # Sum the TFIDF scores

    counted = filtered.groupby(filtered.location).count()
    counted = counted.select(
        counted.location.alias("loc"),
        (column("count") /
         totKeyword).alias("freq"))  # determine the weight for each word

    result = sumed.join(counted, on=sumed.location == counted.loc,
                        how="inner")  # join the tables
    result = result.select(
        result.location,
        (column("tot") * column("freq")).alias("score")).orderBy(
            desc("score")).limit(
                Number)  # Calculate score and return top N values

    if write:
        print("\nWriting the data\n")
        result.write.format('com.databricks.spark.csv').save('query_' +
                                                             ''.join(Query),
                                                             header='true')
    else:
        result.show()
Example #17
def main():
    conf = SparkConf().setAppName("Index Builder")  # configure Spark
    sc = SparkContext(
        conf=conf)  # start Spark Context with the specific configuration
    sql = SQLContext(sc)  # start Spark SQL

    text = sc.wholeTextFiles(
        "/user/root/bbcsport/*")  # fuzy read: Reads all files under bbcsport
    fileCount = text.count()
    # reformat data to make it cleaner and break text into words
    cleaned = text.map(lambda file: ("%s/%s" % (file[0].split("/")[len(file[0].split("/"))-2],\
                                        file[0].split("/")[len(file[0].split("/"))-1]), file[1].lower().split()))\
                  .map(lambda file: (file[0], [re.sub(r'[^\w\s]', '', word) for word in file[1]]))
    # regex cleaning from: https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python

    cleanedDF = cleaned.toDF(["location", "words"])  # create dataframe
    cleanedDF = cleanedDF.select(
        cleanedDF.location,
        explode(cleanedDF.words).alias("word"))  # Flatten the list of words

    tfMap = cleanedDF.groupby(
        cleanedDF.location,
        cleanedDF.word).count()  # Count occurrences of a word in a document
    tfReduce = tfMap.groupby(tfMap.location, tfMap.word).agg(
        sum("count").alias("tf"))  # Calculate TF

    idfMap = cleanedDF.distinct().groupby(
        cleanedDF.word).count()  # count whether a word occurred in a document
    idfReduce = idfMap.select(
        idfMap.word,
        log(fileCount / (column("count"))).alias("idf"))  # Calculate IDF

    joinTfIdf = tfReduce.join(idfReduce, tfReduce.word == idfReduce.word,
                              "inner")  # Join TF & IDF tables
    tfIdf = joinTfIdf.select(column("location"), tfReduce["word"],
                             (column("tf") *
                              column("idf")).alias("tfIdf"))  # Calc. TFIDF

    tfIdf.write.parquet(
        'bbc.parquet')  # write file in an efficient file format
Example #18
def down_sample(data, ratio_adjust=1.2):
    counts = (data.select('stars').groupBy('stars').count().orderBy(
        F.column("count"), ascending=False).collect())
    higher_bound = counts[0][1]
    lower_bound = counts[1][1]
    rand_gen = lambda x: randint(0, higher_bound) if x == "true" else -1
    udf_rand_gen = F.udf(rand_gen, IntegerType())
    threshold_to_filter = int(ratio_adjust * float(lower_bound) /
                              higher_bound * higher_bound)
    data = data.withColumn("randIndex", udf_rand_gen("stars"))
    sampled_training_data = data.filter(
        data['randIndex'] < threshold_to_filter)
    return sampled_training_data.drop('randIndex')
Example #19
 def preprocess(self):
     # read from csv file, load as DataFrame
     csv_task = ReadCSVFile(self.path)
     orig = csv_task.load()
     self.orig = orig
     # rename columns
     self.csv = orig.select(
         column('App Package Name').alias('appname'),
         column('Play Store Name').alias('name'),
         column('Category').alias('category'),
         column('Average Rating').alias('rating'),
         column('Number of Ratings').alias('ratenum'),
         column('Number of Downloads').alias('dlnum'),
         column('Date Updated').alias('update'),
         column('Icon URL').alias('url'))
Example #20
    def cluster(self, data):
        """Run k-means to cluster user into 2 groups"""
        for col in data.schema.names[2:]:
            data = data.withColumn(col, column("0") / column(col))
        data = data.dropna()

        vecAssembler = VectorAssembler(inputCols=data.schema.names[1:],
                                       outputCol="features")
        data = vecAssembler.transform(data)

        scaler = StandardScaler(
            inputCol="features",
            outputCol="scaled",
            withStd=True,
            withMean=True,
        )
        scalerModel = scaler.fit(data)
        data = scalerModel.transform(data)

        kmeans = KMeans().setK(2).setFeaturesCol("scaled").setSeed(666)
        model = kmeans.fit(data)
        predictions = model.transform(data)

        return predictions.select("usrid", "prediction")
Example #21
def average_distance(co_ordinates_df):

    # sorting done on basis of time provided and related lat, lng captured
    window = Window.partitionBy(f.col('id')).orderBy(f.col('time'))
    sorted_df = co_ordinates_df.withColumn('lat_lng_list',
                                           f.collect_list("coordinate").over(window))
    sorted_df = sorted_df.groupBy(f.col('id')) \
        .agg(f.max('lat_lng_list').alias('lat_lng_list'))

    from pyspark.sql.functions import udf

    # Udf is used to pass list of lat, lng values and get average distance
    avg_udf = udf(calculate_average)

    result_df = sorted_df.withColumn('avg_dist',
                                     avg_udf(f.column('lat_lng_list')))

    result_df = result_df.select('id', 'avg_dist')

    return result_df
Example #22
def shortest_path(v_from, v_to, df, max_path_length=10):
    """
        v_from - исходная вершина
        v_to - целевая вершина
        df - Spark DataFrame с ребрами графа
        max_path_length - максимальная длина пути
        
        Возвращает: pyspark.sql.DataFrame, состоящий из одного столбца с найдеными путями
    """
    temp_df = df.filter(df.follower_id == v_from)
    temp_df = temp_df.select(
        f.col('user_id').alias('last_neighbour'),
        f.col('follower_id').alias('path'))

    for i in range(max_path_length):
        if temp_df.filter(temp_df.last_neighbour.isin(v_to)).count() > 0:
            result_df = temp_df.filter(temp_df.last_neighbour.isin(v_to))\
                               .select(f.concat('path', f.lit(','), 'last_neighbour').alias('path'))
            return result_df
        temp_df = temp_df.join(df, temp_df.last_neighbour==df.follower_id, how="inner",)\
                         .select(f.column('user_id').alias('last_neighbour'),
                                 f.concat('path', f.lit(','), 'last_neighbour').alias('path'))
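A small usage sketch for shortest_path on hypothetical data, assuming an active SparkSession named spark (pyspark.sql.functions must already be imported as f for the function body itself):

edges = spark.createDataFrame([(2, 1), (3, 2), (4, 3)],
                              ['user_id', 'follower_id'])
paths = shortest_path(1, 4, edges, max_path_length=5)
if paths is not None:  # None is returned when no path is found in time
    paths.show()       # a single 'path' column, e.g. '1,2,3,4'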
Example #23
def _keys(r: Type[Relation], data: Union["pyspark.sql.DataFrame",
                                         "pandas.DataFrame"]):
    if is_pyspark_df(data, r):
        from pyspark.sql.functions import column, count

        duplicated_rows = (data.groupby(r.get_key_field_names()).agg(
            count("*").alias("count")).filter(column("count") > 1).count())

        if duplicated_rows > 0:
            raise ValueError(f"Key error for '{r.name()}': "
                             f"using keys '{r.get_key_field_names()}'"
                             f" there are {duplicated_rows} duplicates.")

    elif is_pandas_df(data, r):
        duplicated = data[r.get_key_field_names()].duplicated()
        duplicated_rows = len(data[duplicated])

        if duplicated_rows > 0:
            raise ValueError(f"Key error for '{r.name()}': "
                             f"using keys '{r.get_key_field_names()}'"
                             f" there are {duplicated_rows} duplicates.")

    logger.info(f"Relation {r.name()} has passed the key uniqueness check.")
Example #24
 def preprocess(self):
     # read from csv file, load as DataFrame
     csv_task = ReadCSVFile(self.path)
     imgfolder = csv_task.img_path()
     file = open('localdata/train_list', 'rb')
     train_list = pickle.load(file)
     train_id_list = [s.split('/')[1].split('.')[0] for s in train_list]
     file.close()
     orig = csv_task.load()
     self.orig = orig
     # rename columns
     csv_df = orig.select(
         column('UI Number').alias('uiid'),
         column('App Package Name').alias('appname'),
         column('Interaction Trace Number').alias('trace'),
         column('UI Number in Trace').alias('uicount')).where(
             column("uiid").isin(train_id_list))
     writepath_udf = udf(lambda uiid: imgfolder + uiid + '.png',
                         StringType())
     self.csv = csv_df.withColumn("path", writepath_udf(column("uiid")))
     self.num = self.csv.count()
Example #25
    # this is just the treebank tokenizer
    return [word for word in t.tokenize(s) if word not in stopwords_set]

udf_tokenize = sql.udf(tokenize, types.ArrayType(types.StringType()))


(idb_df
    .select(sql.concat_ws(" ", idb_df["data.dwc:occurrenceRemarks"],
                          idb_df["data.dwc:eventRemarks"],
                          idb_df["data.dwc:fieldNotes"]
                         )
                        .alias("note"),
                        idb_df["uuid"]
            )
    .where(sql.column("note") != "")
    .withColumn("tokens", udf_tokenize(sql.column("note")))
    .select(sql.column("uuid"),
            sql.explode(sql.column("tokens")).alias("token")
           )
    .groupBy(sql.column("uuid"), sql.column("token"))
    .count()
    .write
    .mode("overwrite")
    .parquet("/guoda/data/idigbio-{}-tf.parquet".format(idb_df_version))
)


# 4:09 on mesos1 with 96 cores
#time HADOOP_USER_NAME=hdfs spark-submit --master mesos://mesos01.acis.ufl.edu:5050 --executor-memory 20G --driver-memory 10G --total-executor-cores 96 idb_tf_index.py
Example #26
    with the nltk library
    '''

    # word_tokenize uses PunktSentenceTokenizer first, then
    # treebank_word_tokenizer on those so can get nested
    # lists.
    #return nltk.tokenize.word_tokenize(s)

    # this is just the treebank tokenizer
    return [word for word in t.tokenize(s) if word not in stopwords_set]

udf_tokenize = sql.udf(tokenize, types.ArrayType(types.StringType()))


(bhl_df
    .withColumn("tokens", udf_tokenize(sql.column("ocrtext")))
    .select(sql.column("itemid"),
            sql.explode(sql.column("tokens")).alias("token")
           )
    .groupBy(sql.column("itemid"), sql.column("token"))
    .count()
    .write
    .mode("overwrite")
    .parquet("/guoda/data/bhl-{}-tf.parquet".format(bhl_df_version))
)


# 4:09 on mesos1 with 96 cores
#time HADOOP_USER_NAME=hdfs spark-submit --master mesos://mesos01.acis.ufl.edu:5050 --executor-memory 20G --driver-memory 10G --total-executor-cores 96 bhl_tf_index.py

# still hangs
Example #27
df = spark.read.format("json").schema(myManualSchema)\
    .load("/data/flight-data/json/2015-summary.json")

# in Python
from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)

# in Python
df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)

# select columns in different ways
from pyspark.sql.functions import expr, col, column
df.select(
    expr("DEST_COUNTRY_NAME"),
    col("DEST_COUNTRY_NAME"),
    column("DEST_COUNTRY_NAME"))\
  .show(2)

df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)
df.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME"))

df.selectExpr(
    "*", # all original columns
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry")\
.show(2)

df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(2)

# we need to pass explicit values into Spark that are just a value (rather than a new
# column).
from pyspark.sql.functions import lit
Example #28
""" 基本PySpark語法 : 透過PySpark Shell"""
from pyspark.sql.types improt *
sc = SparkContext() # PySpark入口
sqlContext = SQLContext(sc)
df = sqlContext.read.format("json").load("./spark-2.4.4-bin-hadoop2.7/examples/src/main/resources/employees.json")

"""查看Schema"""
df.printSchema() 

"""建構欄位"""
from pyspark.sql.functions import col, column, expr
col("col1")
column("col2")

"""查看欄位"""
df.columns

"""獲取第一條row"""
df.first()

"""查看數據類型"""
df.select("salary").dtypes


"""
    建立DataFrame
        方法: createOrReplaceTempView
        說明:存在就替換,不存在就創建
"""

df.createOrReplaceTempView("dfTable")
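Once the temp view is registered, the same data can be queried with SQL; a small sketch, assuming the standard employees.json sample with name and salary fields:

sqlContext.sql("SELECT name, salary FROM dfTable WHERE salary > 3000").show()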
Example #29
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False, metadata={"hello": "world"})
])

df=spark.read.format("json").schema(myManualSchema)\
  .load("/databricks-datasets/definitive-guide/data/flight-data/json/2015-summary.json")

# create column references
from pyspark.sql.functions import col, column

col("NewColumn")
column("NewColumn")
# not verified until the Catalog performs analysis

# an example expression representing a DAG
from pyspark.sql.functions import expr
expr("(((someCol+5)*200)-6)<otherCol")

# access the DataFrame's columns
spark.read.format("json").load(
    "/databricks-datasets/definitive-guide/data/flight-data/json/2015-summary.json"
).columns

# inspect a Row object
df.first()

# create a Row object
Example #30
spark = SparkSession.builder.master("local").appName("Testing") \
                    .config("spark.some.config.option", "some-value") \
                    .getOrCreate()

df_1 = create_frame(spark)
df_2 = df_1.withColumnRenamed("COL_1", "COL_2").withColumnRenamed("PKEY_1", "PKEY_2") \
           .withColumnRenamed("FKEY_1", "FKEY_2").withColumnRenamed("NAME_1", "NAME_2")
df_3 = df_1.withColumnRenamed("COL_1", "COL_3").withColumnRenamed("PKEY_1", "PKEY_3") \
           .withColumnRenamed("FKEY_1", "FKEY_3").withColumnRenamed("NAME_1", "NAME_3")
df_4 = df_1.withColumnRenamed("COL_1", "COL_4").withColumnRenamed("PKEY_1", "PKEY_4") \
           .withColumnRenamed("FKEY_1", "FKEY_4").withColumnRenamed("NAME_1", "NAME_4")
df_5 = df_1.withColumnRenamed("COL_1", "COL_5").withColumnRenamed("PKEY_1", "PKEY_5") \
           .withColumnRenamed("FKEY_1", "FKEY_5").withColumnRenamed("NAME_1", "NAME_5")
df_6 = df_1.withColumnRenamed("COL_1", "COL_6").withColumnRenamed("PKEY_1", "PKEY_6") \
           .withColumnRenamed("FKEY_1", "FKEY_6").withColumnRenamed("NAME_1", "NAME_6")
df_7 = df_1.withColumnRenamed("COL_1", "COL_7").withColumnRenamed("PKEY_1", "PKEY_7") \
           .withColumnRenamed("FKEY_1", "FKEY_7").withColumnRenamed("NAME_1", "NAME_7")

join_1 = (F.column("FKEY_2") == F.column("PKEY_1"))
join_2 = (F.column("FKEY_3") == F.column("PKEY_2"))
join_3 = (F.column("FKEY_4") == F.column("PKEY_3"))
join_4 = (F.column("FKEY_5") == F.column("PKEY_4"))
join_5 = (F.column("FKEY_6") == F.column("PKEY_5"))

result = df_1.filter("FKEY_1 is null").join(df_2, join_1, "left")
result = result.join(df_3, join_2, "left")
result = result.join(df_4, join_3, "left")
result = result.join(df_5, join_4, "left")

result.show()
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
  StructField("DEST_COUNTRY_NAME", StringType(), True),
  StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
  StructField("count", LongType(), False, metadata={"hello":"world"})
])
df = spark.read.format("json").schema(myManualSchema)\
  .load("/data/flight-data/json/2015-summary.json")


# COMMAND ----------

from pyspark.sql.functions import col, column
col("someColumnName")
column("someColumnName")


# COMMAND ----------

from pyspark.sql.functions import expr
expr("(((someCol + 5) * 200) - 6) < otherCol")


# COMMAND ----------

from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)


# COMMAND ----------
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False, metadata={"hello": "world"})
])
df = spark.read.format("json").schema(myManualSchema)\
  .load("/databricks-datasets/definitive-guide/data/flight-data/json/2015-summary.json")

# COMMAND ----------

from pyspark.sql.functions import col, column
col("someColumnName")
column("someColumnName")

# COMMAND ----------

from pyspark.sql.functions import expr
expr("(((someCol + 5) * 200) - 6) < otherCol")

# COMMAND ----------

from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)

# COMMAND ----------

myRow[0]
myRow[2]
Example #33
        app_df, ui_df = prefilter(db, test_input)
    else:
        app_df = db.load(table='app')
        ui_df = db.load(table='ui')
    return app_df, ui_df

if __name__ == '__main__':
    db = Database()
    # load in a test sample
    file = open('localdata/test_ui', 'rb')
    test_ui = pickle.load(file)
    file.close()
    # prefiltering by category
    time_start = time.time()
    app_df, ui_df = load_from_db(db, test_ui, is_prefilter=True)
    ui_targets = ui_df.select(column('uiid'), column('appname')).collect()
    time_end = time.time()
    print('Read from sql time cost', time_end - time_start, 's')

    # use the ML/DL models
    time_start = time.time()
    model_demo = Model(test_ui, ui_targets)
    sim_df_pd = model_demo.cos_similarity()
    sim_app_df = ui_df.select(column('uiid'), column('appname')).where(
        ui_df.uiid.isin(sim_df_pd.uiid.iloc[0:6].tolist()))
    sim_app = sim_app_df.collect()
    result_df = app_df.where(
        app_df.appname.isin([row.appname for row in sim_app]))
    outputs = result_df.join(sim_app_df, "appname", "left").collect()
    time_end = time.time()
    print('Model time cost', time_end - time_start, 's')
    
    file = open('localdata/result_demo', 'wb')