def main(): spark = SparkSession.builder.getOrCreate() frame = extract_demographics(spark, catalog) chinese_demographics = frame.filter(lower(col("country")) == "china") aggregated_demographics = aggregate_inhabitants_by_province( chinese_demographics) store_chinese_demographics(aggregated_demographics, catalog)
def clean_data(self, list_words_col: str) -> pyspark.sql.DataFrame: """Pre-processing data Processing executed in function: - Lower case - Words start with letters or whitespace - Remove whitespaces into start and final words - Remove rows empty - Transform each row in list :Args: :param list_words_col: column's name of Dataframe :Returns: :return: Dataframe with a word list in each row :Samples: +--------------------+ | value| +--------------------+ |[over, in, one, n...| """ self.__df_doc = self.__df_doc \ .withColumn(list_words_col, lower(col(list_words_col))) \ .withColumn(list_words_col, regexp_replace(str=col(list_words_col), pattern='[^a-z ]', replacement='')) \ .withColumn(list_words_col, trim(col(list_words_col))) \ .filter(self.__df_doc[list_words_col] != "") \ .withColumn(list_words_col, split(list_words_col, ' ')) return self
def _get_by_keywords(self, where_filter: str): prog = re.compile(r'"(.*)"') counts = [] query = self.df for category in self.terms: if category != 'general': cat_terms = self.terms[category] if cat_terms: for term in cat_terms: match = prog.match(term) if match: print(match.groups()) like_expr = '%' + '%'.join( match.groups()).lower() + '%' else: like_expr = '%' + term.lower().replace( '%', '\%').replace(' ', '%').replace('"', '') + '%' col_name = term.lower().replace('%', '\%').replace( ' ', '_').replace('"', '') query = query.withColumn( col_name, sql_funcs.lower( sql_funcs.col('text')).like(like_expr)) counts.append( sql_funcs.sum( sql_funcs.when(sql_funcs.col(col_name), 1)).alias(col_name)) if where_filter == 'none': return json.loads(query.select(counts).toJSON().collect()[0]) else: return json.loads( query.filter(where_filter).select(counts).toJSON().collect() [0])
def get_tokenized_df(partitioned_df): #split into individual words tokenizer = Tokenizer(inputCol='body', outputCol='words_token') tokenized_but_unsplit_still_has_stop_words_and_punctuation_df = tokenizer\ .transform(partitioned_df)\ .select('topic','date_time','month','date', 'words_token') #remove stop words remover = StopWordsRemover(inputCol='words_token', outputCol='words_no_stops') tokenized_but_unsplit_still_has_punctuation_df = remover\ .transform(tokenized_but_unsplit_still_has_stop_words_and_punctuation_df)\ .select('topic','words_no_stops','date_time','month','date') #remove punctuation tokenized_but_unsplit_df = tokenized_but_unsplit_still_has_punctuation_df\ .withColumn('words_and_punct', explode('words_no_stops'))\ .select('topic','words_and_punct','date_time','month','date') df_split_has_nonletters_and_uppercase = tokenized_but_unsplit_df\ .withColumn('word', explode(split(col('words_and_punct'), '[\W_]+')))\ .select('topic','word','date_time','month','date') df_split_has_nonletters = df_split_has_nonletters_and_uppercase\ .withColumn('word', lower(col('word'))) tokenized_and_split_has_duplicates = df_split_has_nonletters\ .filter(df_split_has_nonletters['word'].rlike('[a-zA-Z]')) #duplicates dropped to ignore cases of someone using a word in the same post tokenized_df = tokenized_and_split_has_duplicates.dropDuplicates() return tokenized_df
def main(in_dir, out_dir): df = spark.read.text(in_dir, wholetext=True) df = df.select(f.explode(f.split('value', wordbreak)).alias('word')) df = df.select(f.lower('word').alias('word')) df = df.groupBy('word').agg(f.count('word').alias('count')).orderBy( ['count', 'word'], ascending=False).cache() df.write.csv(out_dir, mode='overwrite', compression='none')
def tf_idf(df, n): # Extracting terms per each row/document as a list temp_df = df.withColumn( 'terms', f.split(f.lower(f.regexp_replace(df.text_entry, '[^\\w\\s-]', '')), ' ')) # Calculating total number of words per row/document temp_df1 = temp_df.withColumn('total_num_words', f.size('terms')) # Extracting words in each documents temp_df2 = temp_df1.withColumn('token', f.explode('terms')) # Calculating tf temp_df3 = temp_df2.groupBy('_id', 'token', 'total_num_words').agg({ 'token': 'count' }).withColumnRenamed('count(token)', 'occurrence').sort('_id') temp_df4 = temp_df3.withColumn('tf', temp_df3.occurrence) # Calculating df temp_df5 = temp_df4.groupBy('token').agg( f.countDistinct('_id')).withColumnRenamed('count(DISTINCT _id)', 'df') # Calculating idf temp_df6 = temp_df5.withColumn('idf', f.log10(n / temp_df5.df)) # Calculating tf-idf joined_df = temp_df4.join(temp_df6, temp_df4.token == temp_df6.token).select( temp_df4.token, temp_df4._id, temp_df4.tf, temp_df6.df, temp_df6.idf) result = joined_df.withColumn('tf_idf', joined_df.tf * joined_df.idf) return result
def words_and_vecs_from_csv(spark, model, csv_filename): if spark: schema = StructType([ StructField("word1", StringType(), False), StructField("word2", StringType(), False) ]) df = spark.read.csv(csv_filename, header=False, schema=schema) df = df.select( lower(col("word1")).alias("word1"), lower(col("word2")).alias("word2")) rows = df.collect() words1 = [row.word1 for row in rows] words2 = [row.word2 for row in rows] as_array = udf(lambda s: [s], ArrayType(StringType(), False)) df = df.withColumn("word1", as_array("word1")).withColumn( "word2", as_array("word2")) wordvecs1 = [ row.model for row in model.transform( df.withColumnRenamed("word1", "sentence")).collect() ] wordvecs2 = [ row.model for row in model.transform( df.withColumnRenamed("word2", "sentence")).collect() ] else: words1 = [] words2 = [] with open(csv_filename, "r") as csv_file: for line in csv_file: row = line.strip("\n").split(",") words1.append(row[0]) words2.append(row[1]) wordvecs1 = [ model.wv[word1] if word1 in model.wv else zeros(model.wv.vector_size) for word1 in words1 ] wordvecs2 = [ model.wv[word2] if word2 in model.wv else zeros(model.wv.vector_size) for word2 in words2 ] return words1, words2, wordvecs1, wordvecs2
def prepare_df(path, const, words_to_index): rdd = sc.textFile(path) row = Row("review") df = rdd.map(row).toDF() # Clean text df_clean = df.select(F.lower(F.regexp_replace(F.col('review'), "n't", " n't")).alias('review')) df_clean = df_clean.select(F.lower(F.regexp_replace(F.col('review'), "[^0-9a-zA-Z\\s]", "")).alias('review')) # Tokenize text tokenizer = Tokenizer(inputCol='review', outputCol='words_token') df_words_token = tokenizer.transform(df_clean).select('words_token') df_cutted = df_words_token.withColumn('length', F.size(F.col('words_token'))) # Replace word with it's index word_udf = F.udf(lambda row: [words_to_index[w] if w in words_to_index.keys() else words_to_index["unk"] for w in row], ArrayType(IntegerType())) df_stemmed = df_cutted.withColumn('words_stemmed', word_udf(F.col('words_token'))) return df_stemmed.withColumn("class", F.lit(const))
def removePunctuation(column): no_punct = regexp_replace(column, "\p{Punct}", '') lowered = lower(no_punct) cleaned = trim(lowered) return cleaned """Removes punctuation, changes to lower case, and strips leading and trailing spaces.
def join(): stream_df = spark.read.json('stream_cleanned') giant_df = spark.read.json('game_info_base') stream_df = stream_df.withColumn('gen_name', functions.lower(stream_df.game)) stream_df = stream_df.select('stream_id', 'gen_name').distinct() game_info = stream_df.groupBy('gen_name').count().cache() real_game_info = game_info.join( giant_df, game_info.gen_name == functions.lower(giant_df.name), 'left') real_game_info = real_game_info.orderBy(real_game_info.count.desc()) real_game_info.where(real_game_info.count < 100).count().show() real_game_info.coalesce(1).write.json('real_game_info', mode='overwrite')
def _transform(self, df): out_col = self.getOutputCol() in_col = self.getInputCol() df_lower = df.withColumn(out_col, f.lower(f.col(in_col))) return df_lower
def lower_column(self, column): try: self.spark_df = self.spark_df.withColumn('temp', f.lower(f.col(column))).drop(column) \ .withColumnRenamed('temp', column) return self.get_json_df_response() except Exception as e: print(e) return None
def lower_case_cols(data): data_dtypes = {col[0]: col[1] for col in data.dtypes} for column in data_dtypes.keys(): if data_dtypes[column] == 'string': data = data.withColumn(column, F.lower(F.col(column))) return data
def preprocess_works(df, output_path): """ 简历工作经历信息预处理 :param df: 工作经历数据,DataFrame数据类型 :param output_path: 输出文件路径 :return: """ # 预处理后输出字段 output_columns = [ "resume_id", "company_name", "industry", "position_name", "position_title", "position_category", "work_start_date", "work_end_date", "work_start_year", "work_end_year", "work_index", "salary_min", "salary_max" ] # 归一化处理公司名称,公司行业,职位名称 df = df.withColumn( "company_name", F.lower(F.trim(F.udf(normalization_company_name)(df.company_name)))) df = df.withColumn( "industry", F.lower(F.trim(F.udf(normalization_industry)(df.industry_category)))) df = df.withColumn( "position_name", F.lower(F.trim(F.udf(normalization_position_name)(df.position_title)))) df = df.withColumn( "position_category", F.lower(F.trim(F.udf(get_position_level1)(df.position_name)))) df = df.withColumn("start_time", F.udf(lambda x: format_date.sub("-", x))(df.start_time)) df = df.withColumn("end_time", F.udf(lambda x: format_date.sub("-", x))(df.end_time)) df = df.withColumn("work_start_date", F.date_format(df.start_time, "yyy-MM")) df = df.withColumn("work_end_date", F.date_format(df.end_time, "yyy-MM")) df = df.withColumn("work_start_year", F.year(df.start_time)) df = df.withColumn("work_end_year", F.year(df.end_time)) df = df.withColumn("work_index", df.index) # 过滤公司、行业、职位全为空的字段 df = df.dropna(how="all", subset=["company_name", "industry", "position_name"]) # 将简历基本信息预处理后的结果保存至指定文件 df.select(output_columns).write.mode('overwrite').save(output_path)
def clean_text(c): return c c = lower(c) c = regexp_replace(c, "^rt ", "") c = regexp_replace(c, "(https?\://)\S+", "") c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "") #c = split(c, "\\s+") tokenization... return c
def task_5(data_io, product_processed_data, word_0, word_1, word_2): # -----------------------------Column names-------------------------------- # Inputs: title_column = 'title' PATTERN = ' ' # Outputs: titleArray_column = 'titleArray' titleVector_column = 'titleVector' # ------------------------------------------------------------------------- # ---------------------- Your implementation begins------------------------ product_processed_data_output = product_processed_data.select( F.split(F.lower(F.col(title_column)), PATTERN)) word2Vec = M.feature.Word2Vec(vectorSize=16, minCount=100, numPartitions=4, inputCol="split(lower(title), )", outputCol="result", seed=5) model = word2Vec.fit(product_processed_data_output) model = word2Vec.fit(product_processed_data_output) model = word2Vec.fit(product_processed_data_output) model = word2Vec.fit(product_processed_data_output) model = word2Vec.fit(product_processed_data_output) model = word2Vec.fit(product_processed_data_output) # result = model.transform(title_df) # result # ------------------------------------------------------------------------- # ---------------------- Put results in res dict -------------------------- res = { 'count_total': None, 'size_vocabulary': None, 'word_0_synonyms': [ (None, None), ], 'word_1_synonyms': [ (None, None), ], 'word_2_synonyms': [ (None, None), ] } # Modify res: res['count_total'] = product_processed_data_output.count() res['size_vocabulary'] = model.getVectors().count() for name, word in zip( ['word_0_synonyms', 'word_1_synonyms', 'word_2_synonyms'], [word_0, word_1, word_2]): res[name] = model.findSynonymsArray(word, 10) # ------------------------------------------------------------------------- # ----------------------------- Do not change ----------------------------- data_io.save(res, 'task_5') return res
def transform_data(spark, input_file, cfg): ''' transform data before writing to delta file ''' staging_fact = spark.read.parquet(input_file) dim_user_columns = [ 'cicid', 'i94yr', 'i94mon', 'biryear', 'gender', 'i94cit', 'i94res' ] dim_user = staging_fact.select(*dim_user_columns) user_rename_column = { 'cicid': 'cicid', 'i94yr': 'year', 'i94mon': 'month', 'biryear': 'birth_year', 'gender': 'gender', 'i94cit': 'i94cit', 'i94res': 'i94res' } dim_user = dim_user.toDF( *[user_rename_column[c] for c in dim_user.columns]) for c in ['i94cit', 'i94res']: dim_user = dim_user.withColumn( c, F.substring(F.col(c).cast(T.StringType()), 1, 3)) map_country_code = spark.read.csv(os.path.join( cfg['PATH']['DEV'], 'output/mapping_data/city_code.csv'), header=True) map_residence_code = spark.read.csv(os.path.join( cfg['PATH']['DEV'], 'output/mapping_data/residence_code.csv'), header=True) dim_user = dim_user.join(map_country_code, 'i94cit', 'left') dim_user = dim_user.join(map_residence_code, 'i94res', 'left') dim_user = dim_user.drop(*['i94res', 'i94cit']) dim_user = dim_user.withColumn('born_country', F.lower(F.col('born_country'))) dim_user = dim_user.withColumn('residence_country', F.lower(F.col('residence_country'))) dim_user = dim_user.withColumn('load_data_timestamp', F.lit(datetime.datetime.now())) return dim_user
def customer_email_match(): ccDF = spark.sql(""" select dl_filename,dl_line_no, dl_file_prefix,dl_file_timestamp, lower(trim(email_id)) as email_id from dl_business.contact_bkp_18062018 where (email_id is not null OR trim(email_id) = '' OR trim(lower(email_id)) = 'null') group by dl_filename,dl_line_no,dl_file_timestamp,dl_file_prefix,email_id """) ccDF = ccDF.dropDuplicates( ['dl_file_prefix', 'dl_file_timestamp', 'dl_line_no', 'email_id']) emailexclDF = spark.sql(""" select lower(trim(email_address)) as email_address from ods_planninginc_data.scv_email_exclusions """) ccDF = ccDF.join( broadcast(emailexclDF), ccDF.email_id == emailexclDF.email_address, "left_outer").where(emailexclDF.email_address.isNull()).select( [F.col(c) for c in ccDF.columns]) ccDF = ccDF.withColumn( "email_str", F.trim(F.lower(F.split(ccDF['email_id'], "@")[0]))).withColumn( "email_domain", F.trim(F.lower(F.split(ccDF['email_id'], "@")[1]))) ccDFFilt = ccDF.filter( F.col("email_domain").isNotNull() | ~(str_isnumeric_udf(F.col("email_str")))) ccDFFilt = ccDFFilt.na.drop(subset=["email_str", "email_domain"]) ccDFFilt = ccDFFilt.withColumn( "std_email_str", email_clean_udf(F.col("email_str"))).withColumn( "std_email_domain", email_clean_udf(F.col("email_domain"))).select( 'dl_filename', 'dl_line_no', 'dl_file_prefix', 'dl_file_timestamp', 'email_id', 'std_email_str', 'std_email_domain') ccDFFilt.createOrReplaceTempView("contact_clean_data") ccDFGrp = spark.sql(""" select std_email_str,std_email_domain from contact_clean_data group by std_email_str,std_email_domain """) ccDFGrp.createOrReplaceTempView("contact_uniq_date") emlmatchDF = spark.sql(""" select a.* from contact_clean_data a,contact_uniq_date b where a.std_email_str = b.std_email_str and a.std_email_domain = b.std_email_domain """) #emlmatchDF.persist() #emlmatchDF.createOrReplaceTempView("contact_match_data") print("Email Match Step is done") return emlmatchDF
def _clean_string_column(df: DataFrame, column_name: str) -> DataFrame: """ Standardize the characters in a StringType() column. Lower case them, then replace spaces with '_' :param df: The dataframe to perform the operation on. :param column_name: The column to perform the operation on. :return: The same dataframe with the formatted column. """ return df.withColumn(column_name, F.lower(F.regexp_replace(column_name, ' ', '_')))
def filter_tweets(txt_lst, df, col): ''' returns entries containing a word in txt_lst ''' res = spark.createDataFrame(sc.emptyRDD(), df.schema) for keyword in txt_lst: filt_tweets = df.filter(f.lower(col).contains(keyword.lower())) res = res.union(filt_tweets).distinct() return res
def get_subreddit_topics_df(subreddit_topics_csv): # convert subreddit topics csv to df, this will be used to get topics column subreddit_topics = spark.read\ .csv(subreddit_topics_csv, header='true', inferSchema='true') subreddit_topics = subreddit_topics\ .withColumn('subreddit', lower(col('subreddit'))) return subreddit_topics
def execute(self): """ Method classifies the recipes with 'beef' as an ingredient for difficulty. Difficulty is classified based on the total time required for prepping and cooking the food. Method extends on the super class TransformRecipes classification logic by applying a filter for the keyword 'beef' in ingredients column. :return: """ return super(BeefRecipes, self).execute().where(lower(col('ingredients')).like("%beef%"))
def create_sentiment(news_df): news_df['date'] = pd.to_datetime(news_df['date']) news_df = news_df[['date', 'symbol', 'combined_data', "sector_name"]] news_df = spark.createDataFrame(news_df) news_text_lower = news_df.select( news_df['date'], news_df['symbol'], functions.lower(news_df['combined_data']).alias("content"), news_df["sector_name"]) #text preprocessing df = news_text_lower.withColumn( 'text_non_asci', strip_non_ascii_udf(news_text_lower['content'])) rm_df = df.withColumn('clean_text', remove_features_udf(df['text_non_asci'])) regexTokenizer = RegexTokenizer(gaps=False, pattern='\w+', inputCol='clean_text', outputCol='token') stopWordsRemover = StopWordsRemover(inputCol='token', outputCol='no_stopword') my_pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover]) reg_model = my_pipeline.fit(rm_df) reg_df = reg_model.transform(rm_df) reg_joined = reg_df.withColumn("fine_text", join_udf(reg_df['no_stopword'])) reg_joined1 = reg_joined.withColumn( 'eng_text', strip_nonenglish_udf(reg_joined['fine_text'])) clean_df = reg_joined1.select('date', 'symbol', 'eng_text', 'sector_name') #clean_df.show(2) #calculate sentiment score news_rdd = clean_df.rdd lemma_rdd = news_rdd.map(lemma) score_rdd = lemma_rdd.map(sentiment_analyzer_scores) news_sentiment = spark.createDataFrame(score_rdd).cache() news_sentiment = news_sentiment.withColumnRenamed('_1', 'date') news_sentiment = news_sentiment.withColumnRenamed('_2', 'symbol') news_sentiment = news_sentiment.withColumnRenamed('_3', 'sector_name') news_sentiment = news_sentiment.withColumnRenamed('_4', 'sentiment') news_sentiment = news_sentiment.withColumn( 'senti_label', senti_udf(news_sentiment['sentiment'])) stock_news = news_sentiment.toPandas() stock_news['date'] = stock_news['date'].apply( lambda x: pd.to_datetime(x).tz_convert('US/Eastern')) stock_news['date'] = stock_news['date'].dt.tz_localize(None) stock_news['date'] = stock_news['date'].dt.date stock_news['date'] = pd.to_datetime(stock_news['date']) return (stock_news)
def task_5(data_io, product_processed_data, word_0, word_1, word_2): # -----------------------------Column names-------------------------------- # Inputs: title_column = 'title' # Outputs: titleArray_column = 'titleArray' titleVector_column = 'titleVector' # ------------------------------------------------------------------------- # ---------------------- Your implementation begins------------------------ step = product_processed_data.select( title_column, F.lower(F.col(title_column)).alias('temp')) split = step.withColumn(titleArray_column, F.split(F.col('temp'), ' ').cast("array<string>")) word2vec = M.feature.Word2Vec(minCount=100, vectorSize=16, seed=SEED, numPartitions=4, inputCol=titleArray_column, outputCol=titleVector_column) model = word2vec.fit(split) # ------------------------------------------------------------------------- # ---------------------- Put results in res dict -------------------------- res = { 'count_total': None, 'size_vocabulary': None, 'word_0_synonyms': [ (None, None), ], 'word_1_synonyms': [ (None, None), ], 'word_2_synonyms': [ (None, None), ] } # Modify res: res['count_total'] = split.count() res['size_vocabulary'] = model.getVectors().count() for name, word in zip( ['word_0_synonyms', 'word_1_synonyms', 'word_2_synonyms'], [word_0, word_1, word_2]): res[name] = model.findSynonymsArray(word, 10) # ------------------------------------------------------------------------- # ----------------------------- Do not change ----------------------------- data_io.save(res, 'task_5') return res
def process(self, cardo_context, cardo_dataframe): # type: (CardoContextBase, CardoDataFrame) -> CardoDataFrame new_columns = [] for column in cardo_dataframe.dataframe.schema: relevant_formatters = filter(lambda formatter: formatter.is_type(column.dataType), self.__formatters) new_column = relevant_formatters[0].convert(cardo_dataframe.dataframe[column.name]) new_column = F.lower(new_column).alias(column.name) new_columns.append(new_column) cardo_dataframe.dataframe = cardo_dataframe.dataframe.select(new_columns) return cardo_dataframe
def is_area_of_study(col): #function to check area of study @F.udf def f1(v): l = ['engineering', 'teaching', 'communications', 'animal ccience', 'science & math', 'law & government', 'architecture', 'business', 'culinary arts', 'performing arts', 'health profession', 'visual art & design', 'film/video', 'cosmetology', 'humanities & interdisciplinary', 'computer science & technology', 'project-based learning', 'hospitality, travel, & tourism', 'performing arts/visual art & design', 'environmental science', 'zoned'] if v in l: return True else: False x = col.cast('string') return F.count(F.when(f1(F.lower(x)) == True, x))
def getTrumpTwitt(df): parseDate = f.udf(lambda x: parser.parse(x), DateType()) return df.select( parseDate(df.created_at).alias('created_at'), f.lower(df.text).alias('text_lower'), 'user.id_str', 'user.screen_name', 'user.followers_count', df.retweet_count, df.id, f.col('entities.user_mentions.screen_name').alias('mentioned'), f.col('entities.hashtags.text').alias( 'hashtags')).where((f.col('text_lower').like('%trump%')) & (f.col('place.country_code') == 'US'))
def transform_countries(df: SparkDataFrame) -> SparkDataFrame: """Transform countries data Add a column of lower case country name for joining purpose :param df: the countries data frame to be transformed :return: the transformed countries frame """ df = df.withColumn('lower_name', lower(df['name'])) return df
def lower_array(df, array_col): """ this function will take an array<string> column in the passed dataframe and lowercase each element in the array :param df: input dataframe :param array_col: name of the array<string> column :return: dataframe with array<string> column with lowercased elements """ return df.withColumn(array_col, split(lower(concat_ws(",", col(array_col))), ","))
def bucket_engine(df): eng = F.lower(F.col("engine")) return df.withColumn( "engine", F.when(eng.like("google%"), "google") .when(eng.like("ddg%"), "duckduckgo") .when(eng.like("duckduckgo%"), "duckduckgo") .when(eng.like("bing%"), "bing") .otherwise("other"), )
def preprocess(self, df: DataFrame): preprocess_df = df.filter(~F.isnull("se_property")) \ .withColumn("se_label", F.lower(F.col("se_label"))) \ .filter(~F.isnull("se_label")) \ .withColumn("se_property_type", self.classify("event_sub_type", "se_label")) \ .withColumn("isSuspect", F.col("isSuspect").cast("int")) \ .filter(F.col("author_id").isNotNull() | F.col("discovery_id").isNotNull()) \ .drop_duplicates(["event_id", "user_token", "device_id", "user_ipaddress", "isSuspect"]) \ .withColumn("hour", F.hour("collector_tstamp")) return preprocess_df
def removePunctuation(column): """Removes punctuation, changes to lower case, and strips leading and trailing spaces. Note: Only spaces, letters, and numbers should be retained. Other characters should should be eliminated (e.g. it's becomes its). Leading and trailing spaces should be removed after punctuation is removed. Args: column (Column): A Column containing a sentence. Returns: Column: A Column named 'sentence' with clean-up operations applied. """ return (trim(regexp_replace(lower(column),'[^a-zA-Z0-9 ]','')).alias('sentence'))
def removePunctuation(column): """Removes punctuation, changes to lower case, and strips leading and trailing spaces. Note: Only spaces, letters, and numbers should be retained. Other characters should should be eliminated (e.g. it's becomes its). Leading and trailing spaces should be removed after punctuation is removed. Args: column (Column): A Column containing a sentence. Returns: Column: A Column named 'sentence' with clean-up operations applied. """ #column_val = regexp_replace(column, "\p{Punct}", "") #return trim(lower(column_val)) word = lower(trim(regexp_replace(regexp_replace(column, '[^\w\s]', ''),'_',''))).alias("word") return word
def removePunctuation(column): """Removes punctuation, changes to lower case, and strips leading and trailing spaces. Note: Only spaces, letters, and numbers should be retained. Other characters should should be eliminated (e.g. it's becomes its). Leading and trailing spaces should be removed after punctuation is removed. Args: column (Column): A Column containing a sentence. Returns: Column: A Column named 'sentence' with clean-up operations applied. """ # assert(isinstance(column, pyspark.sql.column.Column)) assert(str(type(column)) == "<class 'pyspark.sql.column.Column'>") columnNoPunct = regexp_replace(column, "[^a-zA-Z0-9 ]", "") # columnNoPunct = regexp_replace(column, string.punctuation, "") columnLowerCase = lower(columnNoPunct) columnTrimmed = trim(columnLowerCase) return columnTrimmed
from pyspark.sql.functions import monotonically_increasing_id df.select(monotonically_increasing_id()).show(2) # COMMAND ---------- from pyspark.sql.functions import initcap df.select(initcap(col("Description"))).show() # COMMAND ---------- from pyspark.sql.functions import lower, upper df.select(col("Description"), lower(col("Description")), upper(lower(col("Description")))).show(2) # COMMAND ---------- from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim df.select( ltrim(lit(" HELLO ")).alias("ltrim"), rtrim(lit(" HELLO ")).alias("rtrim"), trim(lit(" HELLO ")).alias("trim"), lpad(lit("HELLO"), 3, " ").alias("lp"), rpad(lit("HELLO"), 10, " ").alias("rp")).show(2) # COMMAND ----------
# COMMAND ---------- import pyspark.sql.functions as func from pyspark.sql.functions import col filtered = dfSmall.filter((col("title") != "<PARSE ERROR>") & col("redirect_title").isNull() & col("text").isNotNull()) # COMMAND ---------- # MAGIC %md # MAGIC Change all text to lower case. # COMMAND ---------- lowered = filtered.select("*", func.lower(col("text")).alias("lowerText")) # COMMAND ---------- parsed = lowered.drop("text").withColumnRenamed("lowerText", "text") # COMMAND ---------- # MAGIC %md # MAGIC Split the Wikipedia text into sentences. # COMMAND ---------- pattern = r"(\. |\n{2,})" import re
# COMMAND ---------- import pyspark.sql.functions as func from pyspark.sql.functions import col filtered = dfSmall.filter((col('title') != '<PARSE ERROR>') & col('redirect_title').isNull() & col('text').isNotNull()) # COMMAND ---------- # MAGIC %md # MAGIC Change all text to lower case. # COMMAND ---------- lowered = filtered.select('*', func.lower(col('text')).alias('lowerText')) # COMMAND ---------- parsed = (lowered .drop('text') .withColumnRenamed('lowerText', 'text')) # COMMAND ---------- # MAGIC %md # MAGIC Split the Wikipedia text into sentences. # COMMAND ---------- pattern = r'(\. |\n{2,})'
from lib.pos_tags import PosTags from lib.chunks import Chunks t = Tokens() p = PosTags() c = Chunks() c.train(c.load_training_data("../data/chunker_training_50_fixed.json")) def pipeline(s): """ Given a string, return a list of relations """ return c.assemble(c.tag(p.tag(t.tokenize(s)))) pipeline_udf = sql.udf(pipeline, types.ArrayType(types.MapType(types.StringType(), types.StringType()))) phrases = ( notes.withColumn("phrases", pipeline_udf(notes["document"])) .select(sql.explode(sql.col("phrases")).alias("text")) .filter(sql.col("text")["tag"] == "NP") .select(sql.lower(sql.col("text")["phrase"]).alias("phrase")) .groupBy(sql.col("phrase")) .count() ) phrases.write.parquet("../data/idigbio_phrases.parquet")
withCEST = withDate.withColumn('cest_time', func.from_utc_timestamp('timestamp', 'Europe/Amsterdam')) withCEST.printSchema() (withCEST .select('timestamp', 'cest_time') .show(3, False)) # COMMAND ---------- # MAGIC %md # MAGIC Next, let's convert the text field to lowercase. We'll use the `lower` function for this. # COMMAND ---------- lowered = withCEST.select('*', func.lower(col('text')).alias('lowerText')) print lowered.select('lowerText').first() # COMMAND ---------- # MAGIC %md # MAGIC What columns do we have now? # COMMAND ---------- print lowered.columns # COMMAND ---------- # MAGIC %md