예제 #1
0
def process_rdd(time: time_, rdd: RDD) -> None:
    """Write one non-empty batch of tweets to S3 as JSON.

    Empty batches are skipped. Otherwise the batch time is logged, the RDD is
    converted to a DataFrame with a ``StringType`` schema and written to
    ``s3a://jconf-2020/bronze/<YYYY-MM-DD>/<reverse_current_time_millis()>``.

    Args:
        time: Batch time handed to the callback. It must provide
            ``strftime`` (presumably the ``datetime`` passed by
            ``DStream.foreachRDD`` -- confirm against the caller).
        rdd: Records of the batch.
    """
    if rdd.isEmpty():
        return

    logging.info("----------- %s -----------", time)
    sql_context = get_sql_context_instance(rdd.context)
    tweets_df = sql_context.createDataFrame(rdd, StringType())
    # Derive the date folder from the batch time argument (not from the
    # `time_` name used in the annotation) so it matches the logged batch.
    batch_date = time.strftime("%Y-%m-%d")
    tweets_df.write.json(
        f"s3a://jconf-2020/bronze/{batch_date}/{reverse_current_time_millis()}"
    )
예제 #2
0
 def __preprocessRdd(self, rdd: RDD):
     """Correct, strip and clean a batch of tweet lines.

     Each element is passed through ``rddCorrector().correct``, the
     ``<tweet>`` / ``</tweet>`` tags are removed, and the result is converted
     to a DataFrame and cleaned with ``CleanText``.

     Args:
         rdd: Batch of lines to process; may be ``None``.

     Returns:
         The cleaned DataFrame, or ``None`` when ``rdd`` is ``None`` or the
         corrected RDD is empty.
     """
     # Guard before the first use of `rdd`: calling .map on None would raise.
     if rdd is None:
         return None

     rddc = rddCorrector()
     rdd = rdd.map(lambda line: rddc.correct(line))
     if rdd.isEmpty():
         return None

     # Remove the opening and closing tweet tags from every line.
     rdd = rdd.map(
         lambda line: line.replace("<tweet>", "").replace("</tweet>", "")
     )
     df = DataFrameWorks().convertDataFrame(rdd, self.__spark)
     return CleanText().clean(df, self.__spark)
예제 #3
0
    def __convert_service_format(rdd: RDD) -> RDD:
        """Convert a batch of service records into the keyed row format.

        An empty RDD is returned unchanged. Otherwise the records are turned
        into a DataFrame, passed through ``add_neighborhoods``, and then:

        * ``row_id`` is replaced by ``hasher(row_id)``;
        * ``category_id`` and ``neighborhood_id`` are added via ``hasher``
          from ``category`` and ``neighborhood``;
        * ``opened`` and ``report_datetime`` are parsed with the pattern
          ``yyyy-MM-dd'T'HH:mm:ss.SSS`` and stored as integer Unix timestamps.

        Returns:
            The RDD backing the transformed DataFrame.
        """
        if rdd.isEmpty():
            return rdd

        timestamp_pattern = "yyyy-MM-dd'T'HH:mm:ss.SSS"

        def as_unix_int(column_name):
            # Parse the string column, then cast its Unix timestamp to an integer.
            parsed = to_timestamp(column_name, timestamp_pattern)
            return unix_timestamp(parsed).cast(IntegerType())

        enriched = add_neighborhoods(rdd.toDF())

        converted = (
            enriched
            .withColumn("row_id", hasher(enriched["row_id"]))
            .withColumn("category_id", hasher(enriched["category"]))
            .withColumn("opened", as_unix_int("opened"))
            .withColumn("report_datetime", as_unix_int("report_datetime"))
            .withColumn("neighborhood_id", hasher(enriched["neighborhood"]))
        )
        return converted.rdd
예제 #4
0
    def __convert_service_format(rdd: RDD) -> RDD:
        """Convert a batch of service records into the keyed row format.

        An empty RDD is returned unchanged. Otherwise the records are turned
        into a DataFrame, passed through ``add_neighborhoods``, given
        ``category_id`` / ``neighborhood_id`` columns via ``hasher``, and the
        ``openedStr`` / ``updatedStr`` strings are parsed into integer Unix
        timestamps named ``opened`` / ``updated`` before being dropped.

        Returns:
            The RDD backing the transformed DataFrame.
        """
        if rdd.isEmpty():
            return rdd

        date_pattern = "yyyy-MM-dd'T'HH:mm:ss.SSS"

        def as_unix_int(source_column):
            # Parse the string column, then cast its Unix timestamp to an integer.
            parsed = to_timestamp(source_column, date_pattern)
            return unix_timestamp(parsed).cast(IntegerType())

        # Much of the API data lacks neighborhood information, so it is
        # looked up from lat/lon here.
        located = add_neighborhoods(rdd.toDF())

        # Key columns first, then the parsed dates; the raw date strings go last.
        result = (
            located
            .withColumn("category_id", hasher("category"))
            .withColumn("neighborhood_id", hasher("neighborhood"))
            .withColumn("opened", as_unix_int("openedStr"))
            .withColumn("updated", as_unix_int("updatedStr"))
            .drop("openedStr", "updatedStr")
        )
        return result.rdd