def process_rdd(time: time_, rdd: RDD) -> None:
    """Persist one micro-batch of tweets to the S3 bronze layer as JSON.

    Skips empty batches. Non-empty batches are wrapped in a single-column
    string DataFrame and written under a date-partitioned S3 prefix.

    :param time: batch timestamp supplied by the streaming framework.
    :param rdd: the batch RDD of raw tweet strings.
    """
    if rdd.isEmpty():
        return
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logging.info("----------- %s -----------", time)
    sql_context = get_sql_context_instance(rdd.context)
    tweets_df = sql_context.createDataFrame(rdd, StringType())
    # Bug fix: strftime must be called on the batch timestamp argument
    # `time`, not on `time_` (which is only its type annotation/alias).
    tweets_df.write.json(
        f"""s3a://jconf-2020/bronze/{time.strftime("%Y-%m-%d")}/{reverse_current_time_millis()}"""
    )
def __preprocessRdd(self, rdd: RDD):
    """Correct raw tweet records, strip <tweet> tags, and clean the text.

    :param rdd: RDD of raw tweet strings.
    :return: a cleaned DataFrame, or None when the corrected RDD is
             empty/absent (preserves the original best-effort contract).
    """
    rddc = rddCorrector()
    rdd = rdd.map(lambda l: rddc.correct(l))
    # Idiomatic identity/emptiness checks ("is not None", "not isEmpty()");
    # the two tag-stripping passes are fused into a single map.
    if rdd is not None and not rdd.isEmpty():
        rdd = rdd.map(lambda l: l.replace("<tweet>", "").replace("</tweet>", ""))
        df = DataFrameWorks().convertDataFrame(rdd, self.__spark)
        df = CleanText().clean(df, self.__spark)
        return df
    return None
def __convert_service_format(rdd: RDD) -> RDD:
    """Convert raw service records into the internal row format.

    Empty RDDs pass through untouched. Otherwise the rows are enriched
    with neighborhood data, key columns are hashed into ids, and the
    timestamp strings are parsed into integer epoch seconds.
    """
    if rdd.isEmpty():
        return rdd

    frame = rdd.toDF()
    frame = add_neighborhoods(frame)

    ts_format = "yyyy-MM-dd'T'HH:mm:ss.SSS"

    def _epoch_seconds(column_name):
        # Parse an ISO-style timestamp string into integer epoch seconds.
        return unix_timestamp(to_timestamp(column_name, ts_format)).cast(IntegerType())

    frame = (
        frame.withColumn("row_id", hasher(frame["row_id"]))
        .withColumn("category_id", hasher(frame["category"]))
        .withColumn("opened", _epoch_seconds("opened"))
        .withColumn("report_datetime", _epoch_seconds("report_datetime"))
        .withColumn("neighborhood_id", hasher(frame["neighborhood"]))
    )
    return frame.rdd
def __convert_service_format(rdd: RDD) -> RDD:
    """Convert raw service records into the internal row format.

    Empty RDDs pass through untouched. Otherwise key columns are hashed
    into ids and the raw timestamp strings are replaced by integer epoch
    seconds, dropping the intermediate string columns.
    """
    if rdd.isEmpty():
        return rdd

    data = rdd.toDF()
    # Derive neighborhoods from lat/lon: a lot of the records coming
    # from the API are missing neighborhood data.
    data = add_neighborhoods(data)

    ts_format = "yyyy-MM-dd'T'HH:mm:ss.SSS"

    def _epoch_seconds(column_name):
        # Parse an ISO-style timestamp string into integer epoch seconds.
        return unix_timestamp(to_timestamp(column_name, ts_format)).cast(IntegerType())

    data = (
        data.withColumn("category_id", hasher("category"))
        .withColumn("neighborhood_id", hasher("neighborhood"))
        .withColumn("opened", _epoch_seconds("openedStr"))
        .withColumn("updated", _epoch_seconds("updatedStr"))
        .drop("openedStr", "updatedStr")
    )
    return data.rdd