Python lower示例，pyspark.sql.functions.lower Python示例

示例#1

0

显示文件

文件： 04-Making a function more widely reusable.py 项目： chandrainf/Datacamp

def main():
    spark = SparkSession.builder.getOrCreate()
    frame = extract_demographics(spark, catalog)
    chinese_demographics = frame.filter(lower(col("country")) == "china")
    aggregated_demographics = aggregate_inhabitants_by_province(
        chinese_demographics)
    store_chinese_demographics(aggregated_demographics, catalog)

示例#2

0

显示文件

    def clean_data(self, list_words_col: str) -> pyspark.sql.DataFrame:
        """Pre-processing data
        Processing executed in function:
        - Lower case
        - Words start with letters or whitespace
        - Remove whitespaces into start and final words
        - Remove rows empty
        - Transform each row in list
        :Args:
            :param list_words_col: column's name of Dataframe
        :Returns:
            :return: Dataframe with a word list in each row
        :Samples:
            +--------------------+
            |               value|
            +--------------------+
            |[over, in, one, n...|
        """
        self.__df_doc = self.__df_doc \
            .withColumn(list_words_col, lower(col(list_words_col))) \
            .withColumn(list_words_col, regexp_replace(str=col(list_words_col),
                                                       pattern='[^a-z ]',
                                                       replacement='')) \
            .withColumn(list_words_col, trim(col(list_words_col))) \
            .filter(self.__df_doc[list_words_col] != "") \
            .withColumn(list_words_col, split(list_words_col, ' '))

        return self

示例#3

0

显示文件

文件： WordCount.py 项目： sjoerdapp/TrendingProductsRecommender

 def _get_by_keywords(self, where_filter: str):
     prog = re.compile(r'"(.*)"')
     counts = []
     query = self.df
     for category in self.terms:
         if category != 'general':
             cat_terms = self.terms[category]
             if cat_terms:
                 for term in cat_terms:
                     match = prog.match(term)
                     if match:
                         print(match.groups())
                         like_expr = '%' + '%'.join(
                             match.groups()).lower() + '%'
                     else:
                         like_expr = '%' + term.lower().replace(
                             '%', '\%').replace(' ', '%').replace('"',
                                                                  '') + '%'
                     col_name = term.lower().replace('%', '\%').replace(
                         ' ', '_').replace('"', '')
                     query = query.withColumn(
                         col_name,
                         sql_funcs.lower(
                             sql_funcs.col('text')).like(like_expr))
                     counts.append(
                         sql_funcs.sum(
                             sql_funcs.when(sql_funcs.col(col_name),
                                            1)).alias(col_name))
     if where_filter == 'none':
         return json.loads(query.select(counts).toJSON().collect()[0])
     else:
         return json.loads(
             query.filter(where_filter).select(counts).toJSON().collect()
             [0])

示例#4

0

显示文件

def get_tokenized_df(partitioned_df):
    #split into individual words
    tokenizer = Tokenizer(inputCol='body', outputCol='words_token')
    tokenized_but_unsplit_still_has_stop_words_and_punctuation_df = tokenizer\
    .transform(partitioned_df)\
    .select('topic','date_time','month','date', 'words_token')
    
    #remove stop words
    remover = StopWordsRemover(inputCol='words_token', 
                               outputCol='words_no_stops')
    tokenized_but_unsplit_still_has_punctuation_df = remover\
    .transform(tokenized_but_unsplit_still_has_stop_words_and_punctuation_df)\
    .select('topic','words_no_stops','date_time','month','date')
    
    #remove punctuation
    tokenized_but_unsplit_df = tokenized_but_unsplit_still_has_punctuation_df\
    .withColumn('words_and_punct', explode('words_no_stops'))\
    .select('topic','words_and_punct','date_time','month','date')
    
    df_split_has_nonletters_and_uppercase = tokenized_but_unsplit_df\
    .withColumn('word', explode(split(col('words_and_punct'), '[\W_]+')))\
    .select('topic','word','date_time','month','date')
    
    df_split_has_nonletters = df_split_has_nonletters_and_uppercase\
    .withColumn('word', lower(col('word')))
    
    tokenized_and_split_has_duplicates = df_split_has_nonletters\
    .filter(df_split_has_nonletters['word'].rlike('[a-zA-Z]'))
    
    #duplicates dropped to ignore cases of someone using a word in the same post
    tokenized_df = tokenized_and_split_has_duplicates.dropDuplicates()
    
    return tokenized_df

示例#5

0

显示文件

文件： wordcount.py 项目： kutycoi123/data-science-stuff

def main(in_dir, out_dir):
    df = spark.read.text(in_dir, wholetext=True)
    df = df.select(f.explode(f.split('value', wordbreak)).alias('word'))
    df = df.select(f.lower('word').alias('word'))
    df = df.groupBy('word').agg(f.count('word').alias('count')).orderBy(
        ['count', 'word'], ascending=False).cache()
    df.write.csv(out_dir, mode='overwrite', compression='none')

示例#6

0

显示文件

def tf_idf(df, n):
    # Extracting terms per each row/document as a list
    temp_df = df.withColumn(
        'terms',
        f.split(f.lower(f.regexp_replace(df.text_entry, '[^\\w\\s-]', '')),
                ' '))

    # Calculating total number of words per row/document
    temp_df1 = temp_df.withColumn('total_num_words', f.size('terms'))

    # Extracting words in each documents
    temp_df2 = temp_df1.withColumn('token', f.explode('terms'))

    # Calculating tf
    temp_df3 = temp_df2.groupBy('_id', 'token', 'total_num_words').agg({
        'token':
        'count'
    }).withColumnRenamed('count(token)', 'occurrence').sort('_id')
    temp_df4 = temp_df3.withColumn('tf', temp_df3.occurrence)

    # Calculating df
    temp_df5 = temp_df4.groupBy('token').agg(
        f.countDistinct('_id')).withColumnRenamed('count(DISTINCT _id)', 'df')

    # Calculating idf
    temp_df6 = temp_df5.withColumn('idf', f.log10(n / temp_df5.df))

    # Calculating tf-idf
    joined_df = temp_df4.join(temp_df6,
                              temp_df4.token == temp_df6.token).select(
                                  temp_df4.token, temp_df4._id, temp_df4.tf,
                                  temp_df6.df, temp_df6.idf)
    result = joined_df.withColumn('tf_idf', joined_df.tf * joined_df.idf)

    return result

示例#7

0

显示文件

文件： evaluate_word2vec.py 项目： MGabr/evaluate-glint-word2vec

def words_and_vecs_from_csv(spark, model, csv_filename):
    if spark:
        schema = StructType([
            StructField("word1", StringType(), False),
            StructField("word2", StringType(), False)
        ])
        df = spark.read.csv(csv_filename, header=False, schema=schema)
        df = df.select(
            lower(col("word1")).alias("word1"),
            lower(col("word2")).alias("word2"))

        rows = df.collect()
        words1 = [row.word1 for row in rows]
        words2 = [row.word2 for row in rows]

        as_array = udf(lambda s: [s], ArrayType(StringType(), False))
        df = df.withColumn("word1", as_array("word1")).withColumn(
            "word2", as_array("word2"))
        wordvecs1 = [
            row.model for row in model.transform(
                df.withColumnRenamed("word1", "sentence")).collect()
        ]
        wordvecs2 = [
            row.model for row in model.transform(
                df.withColumnRenamed("word2", "sentence")).collect()
        ]
    else:
        words1 = []
        words2 = []
        with open(csv_filename, "r") as csv_file:
            for line in csv_file:
                row = line.strip("\n").split(",")
                words1.append(row[0])
                words2.append(row[1])

        wordvecs1 = [
            model.wv[word1]
            if word1 in model.wv else zeros(model.wv.vector_size)
            for word1 in words1
        ]
        wordvecs2 = [
            model.wv[word2]
            if word2 in model.wv else zeros(model.wv.vector_size)
            for word2 in words2
        ]

    return words1, words2, wordvecs1, wordvecs2

示例#8

0

显示文件

文件： text_tokenizer.py 项目： griddynamics/ml-dsl

def prepare_df(path, const, words_to_index):
    rdd = sc.textFile(path)
    row = Row("review")
    df = rdd.map(row).toDF()
    # Clean text
    df_clean = df.select(F.lower(F.regexp_replace(F.col('review'), "n't", " n't")).alias('review'))
    df_clean = df_clean.select(F.lower(F.regexp_replace(F.col('review'), "[^0-9a-zA-Z\\s]", "")).alias('review'))
    # Tokenize text
    tokenizer = Tokenizer(inputCol='review', outputCol='words_token')
    df_words_token = tokenizer.transform(df_clean).select('words_token')
    df_cutted = df_words_token.withColumn('length', F.size(F.col('words_token')))
    # Replace word with it's index
    word_udf = F.udf(lambda row: [words_to_index[w] if w in words_to_index.keys()
                                  else words_to_index["unk"] for w in row],
                 ArrayType(IntegerType()))
    df_stemmed = df_cutted.withColumn('words_stemmed', word_udf(F.col('words_token')))
    return df_stemmed.withColumn("class", F.lit(const))

示例#9

0

显示文件

文件： cs105_lab1b_word_count.py 项目： Teitlax/ApacheSpark

def removePunctuation(column):

    no_punct = regexp_replace(column, "\p{Punct}", '')
    lowered = lower(no_punct)
    cleaned = trim(lowered)
    return cleaned

    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

示例#10

0

显示文件

def join():
    stream_df = spark.read.json('stream_cleanned')
    giant_df = spark.read.json('game_info_base')

    stream_df = stream_df.withColumn('gen_name',
                                     functions.lower(stream_df.game))
    stream_df = stream_df.select('stream_id', 'gen_name').distinct()
    game_info = stream_df.groupBy('gen_name').count().cache()

    real_game_info = game_info.join(
        giant_df, game_info.gen_name == functions.lower(giant_df.name), 'left')

    real_game_info = real_game_info.orderBy(real_game_info.count.desc())

    real_game_info.where(real_game_info.count < 100).count().show()

    real_game_info.coalesce(1).write.json('real_game_info', mode='overwrite')

示例#11

0

显示文件

文件： custom_transformers.py 项目： canfielder/DSBA-6190_Proj3_Spark_w_Sagemaker

    def _transform(self, df):

        out_col = self.getOutputCol()
        in_col = self.getInputCol()

        df_lower = df.withColumn(out_col, f.lower(f.col(in_col)))

        return df_lower

示例#12

0

显示文件

文件： service.py 项目： DivyaVashistha/hackathon2020

 def lower_column(self, column):
     try:
         self.spark_df = self.spark_df.withColumn('temp', f.lower(f.col(column))).drop(column) \
             .withColumnRenamed('temp', column)
         return self.get_json_df_response()
     except Exception as e:
         print(e)
         return None

示例#13

0

显示文件

def lower_case_cols(data):
    data_dtypes = {col[0]: col[1] for col in data.dtypes}

    for column in data_dtypes.keys():
        if data_dtypes[column] == 'string':
            data = data.withColumn(column, F.lower(F.col(column)))

    return data

示例#14

0

显示文件

文件： resume_works.py 项目： reganzm/ai

def preprocess_works(df, output_path):
    """
    简历工作经历信息预处理
    :param df: 工作经历数据，DataFrame数据类型
    :param output_path: 输出文件路径
    :return: 
    """

    # 预处理后输出字段
    output_columns = [
        "resume_id", "company_name", "industry", "position_name",
        "position_title", "position_category", "work_start_date",
        "work_end_date", "work_start_year", "work_end_year", "work_index",
        "salary_min", "salary_max"
    ]

    # 归一化处理公司名称，公司行业，职位名称
    df = df.withColumn(
        "company_name",
        F.lower(F.trim(F.udf(normalization_company_name)(df.company_name))))
    df = df.withColumn(
        "industry",
        F.lower(F.trim(F.udf(normalization_industry)(df.industry_category))))
    df = df.withColumn(
        "position_name",
        F.lower(F.trim(F.udf(normalization_position_name)(df.position_title))))
    df = df.withColumn(
        "position_category",
        F.lower(F.trim(F.udf(get_position_level1)(df.position_name))))
    df = df.withColumn("start_time",
                       F.udf(lambda x: format_date.sub("-", x))(df.start_time))
    df = df.withColumn("end_time",
                       F.udf(lambda x: format_date.sub("-", x))(df.end_time))
    df = df.withColumn("work_start_date",
                       F.date_format(df.start_time, "yyy-MM"))
    df = df.withColumn("work_end_date", F.date_format(df.end_time, "yyy-MM"))
    df = df.withColumn("work_start_year", F.year(df.start_time))
    df = df.withColumn("work_end_year", F.year(df.end_time))
    df = df.withColumn("work_index", df.index)

    # 过滤公司、行业、职位全为空的字段
    df = df.dropna(how="all",
                   subset=["company_name", "industry", "position_name"])

    # 将简历基本信息预处理后的结果保存至指定文件
    df.select(output_columns).write.mode('overwrite').save(output_path)

示例#15

0

显示文件

文件： names.py 项目： markusnissl/recsyschallenge2018

def clean_text(c):
    return c
    c = lower(c)
    c = regexp_replace(c, "^rt ", "")
    c = regexp_replace(c, "(https?\://)\S+", "")
    c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
    #c = split(c, "\\s+") tokenization...
    return c

示例#16

0

显示文件

文件： Sample_1 4.38.05 PM.py 项目： BingQZhou/CodeHonestly

def task_5(data_io, product_processed_data, word_0, word_1, word_2):
    # -----------------------------Column names--------------------------------
    # Inputs:
    title_column = 'title'
    PATTERN = ' '
    # Outputs:
    titleArray_column = 'titleArray'
    titleVector_column = 'titleVector'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    product_processed_data_output = product_processed_data.select(
        F.split(F.lower(F.col(title_column)), PATTERN))

    word2Vec = M.feature.Word2Vec(vectorSize=16,
                                  minCount=100,
                                  numPartitions=4,
                                  inputCol="split(lower(title),  )",
                                  outputCol="result",
                                  seed=5)

    model = word2Vec.fit(product_processed_data_output)
    model = word2Vec.fit(product_processed_data_output)
    model = word2Vec.fit(product_processed_data_output)
    model = word2Vec.fit(product_processed_data_output)
    model = word2Vec.fit(product_processed_data_output)
    model = word2Vec.fit(product_processed_data_output)

    #     result = model.transform(title_df)
    #     result
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'size_vocabulary': None,
        'word_0_synonyms': [
            (None, None),
        ],
        'word_1_synonyms': [
            (None, None),
        ],
        'word_2_synonyms': [
            (None, None),
        ]
    }
    # Modify res:
    res['count_total'] = product_processed_data_output.count()
    res['size_vocabulary'] = model.getVectors().count()
    for name, word in zip(
        ['word_0_synonyms', 'word_1_synonyms', 'word_2_synonyms'],
        [word_0, word_1, word_2]):
        res[name] = model.findSynonymsArray(word, 10)
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_5')
    return res

示例#17

0

显示文件

文件： load_dim_user.py 项目： Pathairush/data_engineering_nanodegree

def transform_data(spark, input_file, cfg):
    '''
    transform data before writing to delta file
    '''

    staging_fact = spark.read.parquet(input_file)
    dim_user_columns = [
        'cicid', 'i94yr', 'i94mon', 'biryear', 'gender', 'i94cit', 'i94res'
    ]
    dim_user = staging_fact.select(*dim_user_columns)
    user_rename_column = {
        'cicid': 'cicid',
        'i94yr': 'year',
        'i94mon': 'month',
        'biryear': 'birth_year',
        'gender': 'gender',
        'i94cit': 'i94cit',
        'i94res': 'i94res'
    }

    dim_user = dim_user.toDF(
        *[user_rename_column[c] for c in dim_user.columns])
    for c in ['i94cit', 'i94res']:
        dim_user = dim_user.withColumn(
            c, F.substring(F.col(c).cast(T.StringType()), 1, 3))

    map_country_code = spark.read.csv(os.path.join(
        cfg['PATH']['DEV'], 'output/mapping_data/city_code.csv'),
                                      header=True)
    map_residence_code = spark.read.csv(os.path.join(
        cfg['PATH']['DEV'], 'output/mapping_data/residence_code.csv'),
                                        header=True)

    dim_user = dim_user.join(map_country_code, 'i94cit', 'left')
    dim_user = dim_user.join(map_residence_code, 'i94res', 'left')
    dim_user = dim_user.drop(*['i94res', 'i94cit'])

    dim_user = dim_user.withColumn('born_country',
                                   F.lower(F.col('born_country')))
    dim_user = dim_user.withColumn('residence_country',
                                   F.lower(F.col('residence_country')))
    dim_user = dim_user.withColumn('load_data_timestamp',
                                   F.lit(datetime.datetime.now()))

    return dim_user

示例#18

0

显示文件

文件： customer_match.py 项目： ashishforgit/customer-matching

def customer_email_match():
    ccDF = spark.sql("""
			select dl_filename,dl_line_no, dl_file_prefix,dl_file_timestamp, lower(trim(email_id)) as email_id
			from dl_business.contact_bkp_18062018 where (email_id is not null OR trim(email_id) = '' OR trim(lower(email_id)) = 'null') 
			group by dl_filename,dl_line_no,dl_file_timestamp,dl_file_prefix,email_id
			""")
    ccDF = ccDF.dropDuplicates(
        ['dl_file_prefix', 'dl_file_timestamp', 'dl_line_no', 'email_id'])
    emailexclDF = spark.sql("""
			select lower(trim(email_address)) as email_address from ods_planninginc_data.scv_email_exclusions
			""")
    ccDF = ccDF.join(
        broadcast(emailexclDF), ccDF.email_id == emailexclDF.email_address,
        "left_outer").where(emailexclDF.email_address.isNull()).select(
            [F.col(c) for c in ccDF.columns])
    ccDF = ccDF.withColumn(
        "email_str",
        F.trim(F.lower(F.split(ccDF['email_id'], "@")[0]))).withColumn(
            "email_domain", F.trim(F.lower(F.split(ccDF['email_id'], "@")[1])))
    ccDFFilt = ccDF.filter(
        F.col("email_domain").isNotNull()
        | ~(str_isnumeric_udf(F.col("email_str"))))
    ccDFFilt = ccDFFilt.na.drop(subset=["email_str", "email_domain"])
    ccDFFilt = ccDFFilt.withColumn(
        "std_email_str", email_clean_udf(F.col("email_str"))).withColumn(
            "std_email_domain", email_clean_udf(F.col("email_domain"))).select(
                'dl_filename', 'dl_line_no', 'dl_file_prefix',
                'dl_file_timestamp', 'email_id', 'std_email_str',
                'std_email_domain')
    ccDFFilt.createOrReplaceTempView("contact_clean_data")
    ccDFGrp = spark.sql("""
				select std_email_str,std_email_domain 
				from contact_clean_data 
				group by std_email_str,std_email_domain
				""")
    ccDFGrp.createOrReplaceTempView("contact_uniq_date")
    emlmatchDF = spark.sql("""
				select a.* 
				from contact_clean_data a,contact_uniq_date b 
				where a.std_email_str = b.std_email_str and a.std_email_domain = b.std_email_domain
				""")
    #emlmatchDF.persist()
    #emlmatchDF.createOrReplaceTempView("contact_match_data")
    print("Email Match Step is done")
    return emlmatchDF

示例#19

0

显示文件

def _clean_string_column(df: DataFrame, column_name: str) -> DataFrame:
    """
    Standardize the characters in a StringType() column. Lower case them, then replace spaces with '_'
    :param df: The dataframe to perform the operation on.
    :param column_name: The column to perform the operation on.
    :return: The same dataframe with the formatted column.
    """
    return df.withColumn(column_name,
                         F.lower(F.regexp_replace(column_name, ' ', '_')))

示例#20

0

显示文件

文件： main.py 项目： Jbudnick/Spark-Case-Study-Public

def filter_tweets(txt_lst, df, col):
    '''
    returns entries containing a word in txt_lst
    '''
    res = spark.createDataFrame(sc.emptyRDD(), df.schema)
    for keyword in txt_lst:
            filt_tweets = df.filter(f.lower(col).contains(keyword.lower()))
            res = res.union(filt_tweets).distinct()
    return res

示例#21

0

显示文件

def get_subreddit_topics_df(subreddit_topics_csv):
    # convert subreddit topics csv to df, this will be used to get topics column
    subreddit_topics = spark.read\
    .csv(subreddit_topics_csv, header='true', inferSchema='true')

    subreddit_topics = subreddit_topics\
    .withColumn('subreddit', lower(col('subreddit')))

    return subreddit_topics

示例#22

0

显示文件

 def execute(self):
     """
     Method classifies the recipes with 'beef' as an ingredient for difficulty. Difficulty is
     classified based on the total time required for prepping and cooking the food. Method
     extends on the super class TransformRecipes classification logic by applying a filter for
     the keyword 'beef' in ingredients column.
     :return:
     """
     return super(BeefRecipes, self).execute().where(lower(col('ingredients')).like("%beef%"))

示例#23

0

显示文件

文件： sentiment_generation.py 项目： padamshrestha/Stock_Portfolio_Management

def create_sentiment(news_df):

    news_df['date'] = pd.to_datetime(news_df['date'])
    news_df = news_df[['date', 'symbol', 'combined_data', "sector_name"]]
    news_df = spark.createDataFrame(news_df)

    news_text_lower = news_df.select(
        news_df['date'], news_df['symbol'],
        functions.lower(news_df['combined_data']).alias("content"),
        news_df["sector_name"])

    #text preprocessing
    df = news_text_lower.withColumn(
        'text_non_asci', strip_non_ascii_udf(news_text_lower['content']))
    rm_df = df.withColumn('clean_text',
                          remove_features_udf(df['text_non_asci']))

    regexTokenizer = RegexTokenizer(gaps=False,
                                    pattern='\w+',
                                    inputCol='clean_text',
                                    outputCol='token')
    stopWordsRemover = StopWordsRemover(inputCol='token',
                                        outputCol='no_stopword')

    my_pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover])
    reg_model = my_pipeline.fit(rm_df)
    reg_df = reg_model.transform(rm_df)

    reg_joined = reg_df.withColumn("fine_text",
                                   join_udf(reg_df['no_stopword']))
    reg_joined1 = reg_joined.withColumn(
        'eng_text', strip_nonenglish_udf(reg_joined['fine_text']))
    clean_df = reg_joined1.select('date', 'symbol', 'eng_text', 'sector_name')
    #clean_df.show(2)

    #calculate sentiment score

    news_rdd = clean_df.rdd
    lemma_rdd = news_rdd.map(lemma)
    score_rdd = lemma_rdd.map(sentiment_analyzer_scores)
    news_sentiment = spark.createDataFrame(score_rdd).cache()

    news_sentiment = news_sentiment.withColumnRenamed('_1', 'date')
    news_sentiment = news_sentiment.withColumnRenamed('_2', 'symbol')
    news_sentiment = news_sentiment.withColumnRenamed('_3', 'sector_name')
    news_sentiment = news_sentiment.withColumnRenamed('_4', 'sentiment')
    news_sentiment = news_sentiment.withColumn(
        'senti_label', senti_udf(news_sentiment['sentiment']))
    stock_news = news_sentiment.toPandas()

    stock_news['date'] = stock_news['date'].apply(
        lambda x: pd.to_datetime(x).tz_convert('US/Eastern'))
    stock_news['date'] = stock_news['date'].dt.tz_localize(None)
    stock_news['date'] = stock_news['date'].dt.date
    stock_news['date'] = pd.to_datetime(stock_news['date'])
    return (stock_news)

示例#24

0

显示文件

文件： dsc102pa2.py 项目： jonxsong/ESFSA

def task_5(data_io, product_processed_data, word_0, word_1, word_2):
    # -----------------------------Column names--------------------------------
    # Inputs:
    title_column = 'title'
    # Outputs:
    titleArray_column = 'titleArray'
    titleVector_column = 'titleVector'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    step = product_processed_data.select(
        title_column,
        F.lower(F.col(title_column)).alias('temp'))

    split = step.withColumn(titleArray_column,
                            F.split(F.col('temp'), ' ').cast("array<string>"))

    word2vec = M.feature.Word2Vec(minCount=100,
                                  vectorSize=16,
                                  seed=SEED,
                                  numPartitions=4,
                                  inputCol=titleArray_column,
                                  outputCol=titleVector_column)
    model = word2vec.fit(split)

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'size_vocabulary': None,
        'word_0_synonyms': [
            (None, None),
        ],
        'word_1_synonyms': [
            (None, None),
        ],
        'word_2_synonyms': [
            (None, None),
        ]
    }
    # Modify res:

    res['count_total'] = split.count()
    res['size_vocabulary'] = model.getVectors().count()
    for name, word in zip(
        ['word_0_synonyms', 'word_1_synonyms', 'word_2_synonyms'],
        [word_0, word_1, word_2]):
        res[name] = model.findSynonymsArray(word, 10)

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_5')
    return res

示例#25

0

显示文件

	def process(self, cardo_context, cardo_dataframe):
		# type: (CardoContextBase, CardoDataFrame) -> CardoDataFrame
		new_columns = []
		for column in cardo_dataframe.dataframe.schema:
			relevant_formatters = filter(lambda formatter: formatter.is_type(column.dataType), self.__formatters)
			new_column = relevant_formatters[0].convert(cardo_dataframe.dataframe[column.name])
			new_column = F.lower(new_column).alias(column.name)
			new_columns.append(new_column)
		cardo_dataframe.dataframe = cardo_dataframe.dataframe.select(new_columns)
		return cardo_dataframe

示例#26

0

显示文件

 def is_area_of_study(col):                   #function to check area of study
     @F.udf
     def f1(v):
         l = ['engineering', 'teaching', 'communications', 'animal ccience', 'science & math', 'law & government', 'architecture', 'business', 'culinary arts', 'performing arts', 'health profession', 'visual art & design', 'film/video', 'cosmetology', 'humanities & interdisciplinary', 'computer science & technology', 'project-based learning', 'hospitality, travel, & tourism', 'performing arts/visual art & design', 'environmental science', 'zoned']
         if v in l:
             return True
         else:
             False
     x = col.cast('string')
     return F.count(F.when(f1(F.lower(x)) == True, x))

示例#27

0

显示文件

文件： Trump_Election_pyspark.py 项目： ngantran29/Trump_Tweets_Election

def getTrumpTwitt(df):
    parseDate = f.udf(lambda x: parser.parse(x), DateType())
    return df.select(
        parseDate(df.created_at).alias('created_at'),
        f.lower(df.text).alias('text_lower'), 'user.id_str',
        'user.screen_name', 'user.followers_count', df.retweet_count, df.id,
        f.col('entities.user_mentions.screen_name').alias('mentioned'),
        f.col('entities.hashtags.text').alias(
            'hashtags')).where((f.col('text_lower').like('%trump%'))
                               & (f.col('place.country_code') == 'US'))

示例#28

0

显示文件

文件： transforming.py 项目： nantsou/udacity-data-engineer

def transform_countries(df: SparkDataFrame) -> SparkDataFrame:
    """Transform countries data
    Add a column of lower case country name for joining purpose

    :param df: the countries data frame to be transformed
    :return: the transformed countries frame
    """

    df = df.withColumn('lower_name', lower(df['name']))
    return df

示例#29

0

显示文件

文件： transforms.py 项目： tylarr/pyspark-example-app

def lower_array(df, array_col):
    """
    this function will take an array<string> column in the passed dataframe
    and lowercase each element in the array
    :param df: input dataframe
    :param array_col: name of the array<string> column
    :return: dataframe with array<string> column with lowercased elements
    """
    return df.withColumn(array_col,
                         split(lower(concat_ws(",", col(array_col))), ","))

示例#30

0

显示文件

文件： helpers.py 项目： mozilla/addons_daily

def bucket_engine(df):
    eng = F.lower(F.col("engine"))
    return df.withColumn(
        "engine",
        F.when(eng.like("google%"), "google")
        .when(eng.like("ddg%"), "duckduckgo")
        .when(eng.like("duckduckgo%"), "duckduckgo")
        .when(eng.like("bing%"), "bing")
        .otherwise("other"),
    )

示例#31

0

显示文件

 def preprocess(self, df: DataFrame):
     preprocess_df = df.filter(~F.isnull("se_property")) \
         .withColumn("se_label", F.lower(F.col("se_label"))) \
         .filter(~F.isnull("se_label")) \
         .withColumn("se_property_type", self.classify("event_sub_type", "se_label")) \
         .withColumn("isSuspect", F.col("isSuspect").cast("int")) \
         .filter(F.col("author_id").isNotNull() | F.col("discovery_id").isNotNull()) \
         .drop_duplicates(["event_id", "user_token", "device_id", "user_ipaddress", "isSuspect"]) \
         .withColumn("hour", F.hour("collector_tstamp"))
     return preprocess_df

示例#32

0

显示文件

文件： cs105_lab1b_word_count.py 项目： HappyDash/apacheSpark

def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.
    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.
    Args:
        column (Column): A Column containing a sentence.
    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    return (trim(regexp_replace(lower(column),'[^a-zA-Z0-9 ]','')).alias('sentence'))

示例#33

0

显示文件

文件： cs105_lab1b_word_count.py 项目： librar127/DS_Courses

def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    #column_val = regexp_replace(column, "\p{Punct}", "")
    #return trim(lower(column_val))
    word = lower(trim(regexp_replace(regexp_replace(column, '[^\w\s]', ''),'_',''))).alias("word")
    return word

示例#34

0

显示文件

文件： cs105_lab1b_word_count.py 项目： bdanalytics/Berkeley-Spark

def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    
#     assert(isinstance(column, pyspark.sql.column.Column))
    assert(str(type(column)) == "<class 'pyspark.sql.column.Column'>")    
    
    columnNoPunct = regexp_replace(column, "[^a-zA-Z0-9 ]", "")
#     columnNoPunct = regexp_replace(column, string.punctuation, "")    
    columnLowerCase = lower(columnNoPunct)
    columnTrimmed = trim(columnLowerCase)
    
    return columnTrimmed

示例#35

0

显示文件

文件： Structured_APIs-Chapter_6_Working_with_Different_Types_of_Data.py 项目： yehonatc/Spark-The-Definitive-Guide

from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)


# COMMAND ----------

from pyspark.sql.functions import initcap
df.select(initcap(col("Description"))).show()


# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------

示例#36

0

显示文件

文件： Solutions.py 项目： Inscrutive/spark

# COMMAND ----------

import pyspark.sql.functions as func
from pyspark.sql.functions import col

filtered = dfSmall.filter((col("title") != "<PARSE ERROR>") & col("redirect_title").isNull() & col("text").isNotNull())

# COMMAND ----------

# MAGIC %md
# MAGIC Change all text to lower case.

# COMMAND ----------

lowered = filtered.select("*", func.lower(col("text")).alias("lowerText"))

# COMMAND ----------

parsed = lowered.drop("text").withColumnRenamed("lowerText", "text")

# COMMAND ----------

# MAGIC %md
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

pattern = r"(\. |\n{2,})"
import re

示例#37

0

显示文件

文件： Tutorial.py 项目： Inscrutive/spark

# COMMAND ----------

import pyspark.sql.functions as func
from pyspark.sql.functions import col
filtered = dfSmall.filter((col('title') != '<PARSE ERROR>') &
                           col('redirect_title').isNull() &
                           col('text').isNotNull())

# COMMAND ----------

# MAGIC %md
# MAGIC Change all text to lower case.

# COMMAND ----------

lowered = filtered.select('*', func.lower(col('text')).alias('lowerText'))

# COMMAND ----------

parsed = (lowered
          .drop('text')
          .withColumnRenamed('lowerText', 'text'))

# COMMAND ----------

# MAGIC %md
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

pattern = r'(\. |\n{2,})'

示例#38

0

显示文件

文件： find_phrases_job.py 项目： mjcollin/2016spr

from lib.pos_tags import PosTags
from lib.chunks import Chunks

t = Tokens()
p = PosTags()
c = Chunks()

c.train(c.load_training_data("../data/chunker_training_50_fixed.json"))


def pipeline(s):
    """
    Given a string, return a list of relations
    """
    return c.assemble(c.tag(p.tag(t.tokenize(s))))


pipeline_udf = sql.udf(pipeline, types.ArrayType(types.MapType(types.StringType(), types.StringType())))


phrases = (
    notes.withColumn("phrases", pipeline_udf(notes["document"]))
    .select(sql.explode(sql.col("phrases")).alias("text"))
    .filter(sql.col("text")["tag"] == "NP")
    .select(sql.lower(sql.col("text")["phrase"]).alias("phrase"))
    .groupBy(sql.col("phrase"))
    .count()
)

phrases.write.parquet("../data/idigbio_phrases.parquet")

示例#39

0

显示文件

文件： wiki-eda_answers.py 项目： smoltis/spark

withCEST = withDate.withColumn('cest_time', func.from_utc_timestamp('timestamp', 'Europe/Amsterdam'))
withCEST.printSchema()

(withCEST
 .select('timestamp', 'cest_time')
 .show(3, False))

# COMMAND ----------

# MAGIC %md
# MAGIC Next, let's convert the text field to lowercase.  We'll use the `lower` function for this.

# COMMAND ----------

lowered = withCEST.select('*', func.lower(col('text')).alias('lowerText'))

print lowered.select('lowerText').first()

# COMMAND ----------

# MAGIC %md
# MAGIC What columns do we have now?

# COMMAND ----------

print lowered.columns

# COMMAND ----------

# MAGIC %md