def toProcessTransData():
    df1 = rawdata[0].withColumn('yymmref', F.col("year") * 100 + F.lit(7)) \
        .withColumn("ed_model_id", F.col("ed_model_id").cast(T.StringType())) \
        .withColumn("yymminitial", F.when(F.col("yymmref") < F.col("yymminitial"), F.col("yymmref")).otherwise(F.col("yymminitial"))) \
        .withColumn('iyear', F.col('yymminitial').substr(1, 4).cast("integer")) \
        .withColumn('imonth', F.col('yymminitial').substr(5, 6).cast("integer")) \
        .withColumn("idate", F.to_date(F.concat_ws("-", "iyear", "imonth", F.lit(1)))) \
        .withColumn("adate", F.to_date(F.concat_ws("-", "saleyear", "salemonth", F.lit(1)))) \
        .withColumn("fdate", F.lit(fdate)) \
        .withColumn("age", F.months_between(F.col("adate"), F.col('idate')) + 1) \
        .withColumn("age", F.when(F.col('age') < 1, 1).otherwise(F.col('age'))) \
        .withColumn("agef", F.months_between(F.col("fdate"), F.col('idate')) + 1) \
        .withColumn("agef", F.when(F.col('agef') < 1, 1).otherwise(F.col('agef'))) \
        .withColumn("pdate", F.expr("add_months(fdate, 0)")) \
        .withColumn("tdate", F.expr("add_months(fdate, -3)")) \
        .withColumn("syrmmt", F.year('tdate') * 100 + F.month('tdate')) \
        .withColumn("psaleyear", F.year('pdate')) \
        .withColumn("psalemonth", F.month('pdate')) \
        .cache()
    extraagelist = df1.filter("age<0 and trans_count>=3") \
        .groupBy('year', 'make', 'model') \
        .agg(F.min(F.col("syrmm")).alias('age1'))
    df1 = df1.filter("age>0")
    # extraagelist.write.format("delta").option("overwriteSchema", "true").mode("overwrite").saveAsTable("stats.nn_shortterm_extraagelist")
    print(extraagelist.count())
    return df1
def getDataPlacePoor(path):
    path01 = 'hdfs://localhost:9000/csv/Join_Canton'
    data01 = spark.read.format('csv').option('header', 'true').load(path01) \
        .where('Level = 04').drop('UpperCode', 'AllName', 'DT') \
        .withColumn('CantonCode', F.split('CantonCode', '\d{4}$')) \
        .withColumn('CantonCode', F.concat_ws("", "CantonCode"))
    data01 = data01.withColumn("CantonName",
                               F.when(data01.CantonName == '建档立卡人员', '固阳县建档立卡人员')
                                .otherwise(data01.CantonName))
    data = spark.read.format('parquet').load(path) \
        .select('PersonalType', 'AllName', 'DT', 'HosRegisterCode', 'CantonCode') \
        .dropDuplicates(subset=['HosRegisterCode']) \
        .where('PersonalType = 17') \
        .withColumn('CantonCode', F.split('CantonCode', '\d{4}$')) \
        .withColumn('CantonCode', F.concat_ws("", "CantonCode"))
    # dedupe on HosRegisterCode before dropping it; dropDuplicates cannot resolve a column
    # that has already been dropped
    data = data.join(data01, on='CantonCode', how='left_outer') \
        .dropDuplicates(subset=['HosRegisterCode']) \
        .drop('PersonalType', 'Level', 'CantonCode', 'AllName', 'ZoneCode', 'HosRegisterCode') \
        .withColumn('Times', F.lit(1)) \
        .groupby('CantonName') \
        .pivot('DT', ['2017', '2018', '2019']) \
        .agg(F.sum('Times')) \
        .fillna(0)
    data = data.orderBy(data['2019'].desc())
    data.show(50)
    data.groupby().sum().show()  # total is 21335
def read_arxiv(spark, processed_path):
    """Creates a dataframe with the columns:

    `id`: global id
    `source`: arxiv
    `source_id`: arxiv id
    `type`: publication
    `title`
    `venue`: concatenation of subjects
    `abstract`
    `scientists`: authors
    `organizations`: null
    `date`: publication date
    `content`: concatenation of abstract, title, authors, and subjects
    """
    arxiv_path = os.path.join(processed_path, 'arxiv.parquet')
    arxiv_df = spark.read.parquet(arxiv_path)
    return arxiv_df.select(
        fn.concat(fn.lit('arxiv_'), fn.col('id')).alias('id'),
        fn.lit('arxiv').alias('source'),
        fn.col('id').astype('string').alias('source_id'),
        fn.lit('publication').alias('type'),
        'title',
        fn.concat_ws('; ', 'subjects').alias('venue'),
        'abstract',
        fn.concat_ws(';', 'authors').alias('scientists'),
        fn.lit(None).astype('string').alias('organizations'),
        fn.col('datastamp').alias('date'),
        fn.concat_ws(' ', fn.col('abstract'), fn.col('title'),
                     fn.concat_ws(' ', 'authors'),
                     fn.concat_ws(' ', 'subjects')).alias('content'),
        fn.lit(None).astype('string').alias('end_date'),
        fn.lit(None).astype('string').alias('city'),
        fn.lit(None).astype('string').alias('country'),
        fn.lit(None).astype('string').alias('other_id'))
def extract_data(self):
    """Method to extract data from the csv file."""
    works_data = self.data_path + '*'
    works_data_df = self.spark.read.load(works_data, format="csv", header="true")
    unicode_conversion = udf(lambda value: unicodedata.normalize(
        'NFKD', value).encode('ascii', 'ignore').decode())
    works_data_df = works_data_df.withColumn(
        'converted_title', unicode_conversion(col('title')))
    works_data_df = works_data_df.withColumn(
        'converted_contributors', unicode_conversion(col('contributors')))
    reconciled_data = works_data_df.select('*') \
        .groupBy('iswc') \
        .agg(concat_ws(', ', collect_set('converted_title')).alias('title'),
             concat_ws('|', collect_set('converted_contributors')).alias('contributors'),
             concat_ws(', ', collect_set('source')).alias('sources')) \
        .dropDuplicates() \
        .na.drop()
    return reconciled_data
def create_values(cols):
    values = []
    for col in cols:
        if col.is_lookup == 1:
            values.append(
                f.when(f.col(col.demographic_key).isNull(),
                       f.concat_ws('_', f.lit(col.demographic_key), f.lit('9999')))
                 .when(f.trim(f.col(col.demographic_key)) == '',
                       f.concat_ws('_', f.lit(col.demographic_key), f.lit('9999')))
                 .when(f.length(f.regexp_extract(
                           f.col(col.demographic_key).astype('string'), '(\d+)', 1)) > 0,
                       f.concat_ws('_', f.lit(col.demographic_key),
                                   f.col(col.demographic_key).astype('int').astype('string')))
                 .otherwise(f.concat_ws('_', f.lit(col.demographic_key),
                                        f.col(col.demographic_key))))
        else:
            values.append(f.col(col.demographic_key))
    return values
def createPipeline(readStream):
    split_col = split(readStream['value'], r" \[")
    message = split_col.getItem(1)
    systemDetails = split(split_col.getItem(0), ' ')
    currentYear = datetime.now().year
    month = systemDetails.getItem(1)
    date = systemDetails.getItem(2)
    time = systemDetails.getItem(3)
    source = systemDetails.getItem(4)
    fsm = split(split(systemDetails.getItem(5), '%').getItem(1), '-')
    facility = fsm.getItem(0)
    severity = fsm.getItem(1)
    mnemonic = fsm.getItem(2)
    udf = UserDefinedFunction(lambda x: MONTHS.get(x), StringType())
    return readStream.withColumn('timestamp',
                                 concat_ws(' ',
                                           concat_ws('-', lit(currentYear), udf(month), date),
                                           time).astype(TimestampType())) \
        .withColumn('source', source) \
        .withColumn('facility', facility) \
        .withColumn('severity', severity) \
        .withColumn('mnemonic', mnemonic) \
        .withColumn('message', concat(lit('['), message)) \
        .selectExpr("to_json(struct(timestamp, source, facility, severity, mnemonic, message)) AS value")
def partition(spark, partition_config):
    unique_table = partition_config['unique_to']
    spark.sql("DROP TABLE IF EXISTS {}".format(unique_table))
    source = partition_config['source_tag']
    target = partition_config['target_tag']
    full = {}
    full[source] = spark.read.table(partition_config['full_source']) \
        .select('sub') \
        .dropDuplicates() \
        .withColumn('key', F.lit(''))
    full[target] = spark.read.table(partition_config['full_target']) \
        .select('sub') \
        .dropDuplicates() \
        .withColumn('key', F.lit(''))
    for p_conf in partition_config['partition_by']:
        no_attr = {}
        for src in [source, target]:
            p_by = spark.read.table(p_conf[src]) \
                .select('sub', F.lower(F.col('obj')).alias('new_key'))
            profile = full[src].join(p_by, 'sub', 'left')
            profile.cache()
            no_attr[src] = profile.filter(F.col('new_key').isNull()) \
                .select('sub', 'key')
            f = profile.filter(F.col('new_key').isNotNull()) \
                .select('sub', 'key', F.concat_ws('$', 'key', 'new_key').alias('new_key'))
            f.cache()
            full[src] = f
        unique = find_unique(full[source], full[target], 'sub', 'new_key', source, target) \
            .select('sub', 'db', F.col('new_key').alias('key'))
        unique.write.saveAsTable(unique_table, mode='append')
        # deal with ns
        t_k = full[target].select('key', 'new_key') \
            .dropDuplicates()
        n_s = no_attr[source].withColumn('new_key', F.lit('NULL')) \
            .select('sub', 'key', F.concat_ws('$', 'key', 'new_key').alias('new_key'))
        n_s_t = no_attr[source].join(t_k, 'key')
        # deal with nt
        s_k = full[source].select('key', 'new_key') \
            .dropDuplicates()
        n_t = no_attr[target].withColumn('new_key', F.lit('NULL')) \
            .select('sub', 'key', F.concat_ws('$', 'key', 'new_key').alias('new_key'))
        n_t_s = no_attr[target].join(s_k, 'key')
        full[source] = full[source].unionByName(n_s) \
            .unionByName(n_s_t) \
            .select('sub', F.col('new_key').alias('key'))
        full[target] = full[target].unionByName(n_t) \
            .unionByName(n_t_s) \
            .select('sub', F.col('new_key').alias('key'))
    full_s = full[source].withColumn('db', F.lit(source))
    full_t = full[target].withColumn('db', F.lit(target))
    result = full_s.unionByName(full_t)
    result.write.saveAsTable(partition_config['partition_to'], mode='overwrite')
    spark.catalog.clearCache()
def group_batched_logs(logs):
    # group logs from uckey + interval_time + keyword.
    # group 1: group by uckey + interval_starting_time + keyword
    df = logs.groupBy('uckey', 'interval_starting_time', 'keyword_index').agg(
        first('keyword').alias('keyword'),
        fn.sum(col('is_click')).alias('kw_clicks_count'),
        # count() skips nulls, so omit .otherwise() to count only the non-click rows
        fn.count(fn.when(col('is_click') == 0, 1)).alias('kw_shows_count')
    )
    df = df.withColumn('kwi_clicks_count', concat_ws(":", col('keyword_index'), col('kw_clicks_count')))
    df = df.withColumn('kwi_shows_count', concat_ws(":", col('keyword_index'), col('kw_shows_count')))
    df = df.withColumn('kw_clicks_count', concat_ws(":", col('keyword'), col('kw_clicks_count')))
    df = df.withColumn('kw_shows_count', concat_ws(":", col('keyword'), col('kw_shows_count')))
    # group 2: group by uckey + interval_starting_time
    df = df.groupBy('uckey', 'interval_starting_time').agg(
        concat_ws(",", collect_list('keyword_index')).alias('kwi'),
        concat_ws(",", collect_list('kwi_clicks_count')).alias('kwi_click_counts'),
        concat_ws(",", collect_list('kwi_shows_count')).alias('kwi_show_counts'),
        concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
        concat_ws(",", collect_list('kw_clicks_count')).alias('kw_click_counts'),
        concat_ws(",", collect_list('kw_shows_count')).alias('kw_show_counts')
    )
    return df
def shortest_path(v_from, v_to, df_name, output, max_path_length=10):
    schema = StructType(fields=[
        StructField("user_id", StringType()),
        StructField("follower_id", StringType())
    ])
    df = spark.read.schema(schema).format("csv").option("sep", "\t").load(df_name)
    df_sel = df.where(df.follower_id == v_from)
    df_paths = df_sel.select(
        f.concat_ws(",", "follower_id", "user_id").alias("path"),
        df_sel.user_id.alias("next"))
    for i in range(max_path_length):
        if df_paths.where(df_paths.next == v_to).count() == 0:
            df_ext = df_paths.join(df.select(df.follower_id.alias("next"), df.user_id),
                                   on="next", how="inner")
            df_paths = df_ext.select(
                f.concat_ws(",", "path", "user_id").alias("path"),
                df_ext.user_id.alias("next"))
        else:
            df_paths.select("path").where(df_paths.next == v_to) \
                .write.mode("overwrite").text(output)
            break
    spark.stop()
def _cross_features(need_cross_features: list, train_data: DataFrame, test_data: DataFrame):
    cross_features = list()
    for item in need_cross_features:
        if not isinstance(item, dict) \
                or "feature_list" not in item.keys() \
                or "hash_bucket_size" not in item.keys():
            print("each item in need_cross_features must be a dict "
                  "with keys 'feature_list' and 'hash_bucket_size'!")
            continue
        # continuous features are crossed using their bucketized values
        concat_features, hash_num = item["feature_list"], item["hash_bucket_size"]
        new_feature_name = config.HASH_FEATURE_PREFIX + "_".join(concat_features)
        train_data = train_data.withColumn(
            new_feature_name,
            _hash_value_udf(hash_num)(F.concat_ws("_", *concat_features)).cast("int"))
        test_data = test_data.withColumn(
            new_feature_name,
            _hash_value_udf(hash_num)(F.concat_ws("_", *concat_features)).cast("int"))
        cross_features.append(new_feature_name)
        print("generated new crossed feature {0}.".format(new_feature_name))
    return train_data, test_data, cross_features
def test_concat_ws_nulls_arrays():
    gen = ArrayGen(StringGen(nullable=True), nullable=True)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, gen).select(
            f.concat_ws("*", f.lit('z'), f.array(f.lit('2'), f.lit(None), f.lit('Z'))),
            f.concat_ws("*", f.array(f.lit(None), f.lit(None))),
            f.concat_ws("*", f.array(f.lit(None), f.lit(None)), f.col('b'), f.lit('a'))))
def columnsMergeCore(df, requestDict):
    columnNames = requestDict['columnNames']
    # The default separator is ","; if the request specifies a connector, the user-specified one wins
    try:
        splitSymbol = requestDict['connector']
    except KeyError:
        splitSymbol = ','
    # The default name of the new column is "合并结果(col1, col2, col3, ...)"; a user-specified name wins
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = "合并结果" + "(" + str(columnNames).strip("[]") + ")"
    # Merge (Spark DataFrame column ops are clumsy here, so use the brute-force approach for now >_< )
    if len(columnNames) == 2:
        df = df.withColumn(
            newColumnName,
            concat_ws(splitSymbol, df[columnNames[0]], df[columnNames[1]]))
    elif len(columnNames) == 3:
        df = df.withColumn(
            newColumnName,
            concat_ws(splitSymbol, df[columnNames[0]], df[columnNames[1]], df[columnNames[2]]))
    elif len(columnNames) == 4:
        df = df.withColumn(
            newColumnName,
            concat_ws(splitSymbol, df[columnNames[0]], df[columnNames[1]],
                      df[columnNames[2]], df[columnNames[3]]))
    return df
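# A general alternative to the branch-per-arity merge above: concat_ws accepts a variable
# number of columns, so the whole if/elif ladder can collapse into a single call.
# Minimal sketch under the same df/requestDict assumptions as columnsMergeCore;
# the function name columnsMergeAny is hypothetical.
def columnsMergeAny(df, requestDict):
    columnNames = requestDict['columnNames']
    splitSymbol = requestDict.get('connector', ',')
    newColumnName = requestDict.get('newColumnName',
                                    "合并结果" + "(" + str(columnNames).strip("[]") + ")")
    # unpack every requested column into one concat_ws call, whatever the arity
    return df.withColumn(newColumnName,
                         concat_ws(splitSymbol, *[df[c] for c in columnNames]))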
def token_score(df, on, value):
    q_val = value
    df = df.select([on])
    df = df.withColumn('query', F.lit(q_val).cast(F.StringType()))
    # TODO: implement the pattern
    pattern = ','
    df = df.withColumn('tokens1', F.split(F.col('left'), pattern))
    df = df.withColumn('tokens2', F.split(F.col('right'), pattern))

    # intersection = tokens1.intersection(tokens2)
    # diff1to2 = tokens1.difference(tokens2) = pure token 1
    # diff2to1 = tokens2.difference(tokens1) = pure token 2
    # TODO: implement an intersect and a diff method
    df = df.withColumn('intersection', F.intersect('tokens1', 'tokens2'))
    df = df.withColumn('diff1to2', F.diff('tokens1', 'tokens2'))
    df = df.withColumn('diff2to1', F.diff('tokens2', 'tokens1'))

    # sorted_sect = " ".join(sorted(intersection))
    # sorted_1to2 = " ".join(sorted(diff1to2))
    # sorted_2to1 = " ".join(sorted(diff2to1))
    # TODO: implement a concat for an array
    df = df.withColumn('sorted_sect', F.concat_ws(' ', F.sort_array('intersection')))
    df = df.withColumn('sorted_1to2', F.concat_ws(' ', F.sort_array('diff1to2')))
    df = df.withColumn('sorted_2to1', F.concat_ws(' ', F.sort_array('diff2to1')))

    # combined_1to2 = sorted_sect + " " + sorted_1to2 = chain 1 that has been sorted
    # combined_2to1 = sorted_sect + " " + sorted_2to1 = chain 2 that has been sorted
    # TODO: no, i'm joking
    df = df.withColumn('combined_1to2', F.concat_ws(' ', 'sorted_sect', 'sorted_1to2'))
    df = df.withColumn('combined_2to1', F.concat_ws(' ', 'sorted_sect', 'sorted_2to1'))

    # strip the concatenated strings
    for c in ['sorted_sect', 'combined_1to2', 'combined_2to1']:
        df = df.withColumn(c, F.trim(c))

    # TODO: create a function spark_ratio
    df = df.withColumn('ratio1', spark_ratio(F.col('sorted_sect'), F.col('combined_1to2')))
    df = df.withColumn('ratio2', spark_ratio(F.col('sorted_sect'), F.col('combined_2to1')))
    df = df.withColumn('ratio3', spark_ratio(F.col('combined_2to1'), F.col('combined_1to2')))

    # row-wise maximum of the three pairwise ratios
    df = df.withColumn('max_ratio', F.greatest('ratio1', 'ratio2', 'ratio3'))
    df = df.withColumnRenamed('max_ratio', 'token_fuzzy')
    df = df.select(['token_fuzzy'])
    return df
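# The TODOs above (array set intersection/difference, joining a sorted array, row-wise max)
# are covered by Spark built-ins available since 2.4: array_intersect, array_except,
# sort_array and greatest. A minimal sketch, assuming `from pyspark.sql import functions as F`,
# two array<string> columns tokens1/tokens2, and the spark_ratio UDF referenced in the
# draft above; the helper name token_sort_columns is hypothetical.
def token_sort_columns(df):
    df = df.withColumn('intersection', F.array_intersect('tokens1', 'tokens2'))
    df = df.withColumn('diff1to2', F.array_except('tokens1', 'tokens2'))
    df = df.withColumn('diff2to1', F.array_except('tokens2', 'tokens1'))
    df = df.withColumn('sorted_sect', F.concat_ws(' ', F.sort_array('intersection')))
    df = df.withColumn('sorted_1to2', F.concat_ws(' ', F.sort_array('diff1to2')))
    df = df.withColumn('sorted_2to1', F.concat_ws(' ', F.sort_array('diff2to1')))
    df = df.withColumn('combined_1to2', F.trim(F.concat_ws(' ', 'sorted_sect', 'sorted_1to2')))
    df = df.withColumn('combined_2to1', F.trim(F.concat_ws(' ', 'sorted_sect', 'sorted_2to1')))
    # keep the best of the three pairwise ratios as the fuzzy token score
    df = df.withColumn('token_fuzzy', F.greatest(
        spark_ratio(F.col('sorted_sect'), F.col('combined_1to2')),
        spark_ratio(F.col('sorted_sect'), F.col('combined_2to1')),
        spark_ratio(F.col('combined_1to2'), F.col('combined_2to1'))))
    return df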
def pivot(df):
    """convert to wide format"""
    df = (
        df
        # deterministic ordering for questions
        .withColumn('order_by', F.concat_ws('_', 'page_idx', 'question_idx'))
        .withColumn(
            'order_by',
            F.when(
                F.col('family') != 'single_choice',
                F.concat_ws('_', 'order_by', F.coalesce('choice_id', 'row_id', 'other_id'))
            ).otherwise(F.col('order_by'))
        )
        # enumerator for questions with same column name
        .withColumn('rank', F.dense_rank().over(Window.partitionBy('column').orderBy('question_id')))
        # construct orderable column names
        .withColumn('column', F.concat_ws('_', F.lit('_'), 'order_by', 'column', 'rank'))
        .groupBy(RESPONSE_KEY)
        .pivot('column')
        .agg(F.first('value'))
    )

    # set column order
    question_cols = set(df.columns) - set(RESPONSE_KEY)
    columns = RESPONSE_KEY + sorted(question_cols)
    df = df.select(*columns)

    # find single_choice questions with "Other" option
    questions_w_other = []
    base = columns[0]
    for col in columns:
        b = re.sub(r'_\d+$', '', base)  # don't consider enumerator
        # if column looks like `this_is_the_base_other`
        if b in col and 'other' in col:
            questions_w_other.append((base, col))
        base = col

    # inject "Other" for single choice questions
    for base, other in questions_w_other:
        df = df.withColumn(
            base,
            F.when(
                F.col(other).isNotNull(),
                F.coalesce(F.col(base), F.lit('Other (please specify)'))
            ).otherwise(F.col(base)))

    # drop __question_id prefixes and _1 suffixes
    names = df.columns
    names = map(lambda s: re.sub(r'^__[\d+_]+', '', s), names)
    names = map(lambda s: re.sub(r'_1$', '', s), names)
    df = df.toDF(*names)
    return df
def getNewAge(data):
    # 150221 1940 0212472x
    # data = spark.read.format('parquet').load(path)
    data = data.withColumn("Born", F.split('CertificateCode', '\d{7}.$')) \
        .withColumn('Born', F.concat_ws("", "Born")) \
        .withColumn('Born', F.split('Born', '\d{6}')) \
        .withColumn('Born', F.concat_ws("", "Born"))
    data = data.withColumn("Age",
                           F.when((data.DT - data.Born) != data.Age, data.DT - data.Born)
                            .otherwise(data.Age)) \
        .drop('Born')
    data.show(30)
    dealNull(data)
def getNewAge(path, path1):
    # 150221 1940 0212472x
    data = spark.read.format('csv').option('header', 'true').load(path)
    data = data.withColumn("Born", F.split('CertificateCode', '\d{7}.$')) \
        .withColumn('Born', F.concat_ws("", "Born")) \
        .withColumn('Born', F.split('Born', '\d{6}')) \
        .withColumn('Born', F.concat_ws("", "Born"))
    data = data.withColumn("Age",
                           F.when((data.DT - data.Born) != data.Age, data.DT - data.Born)
                            .otherwise(data.Age)) \
        .drop('Born')
    data.show()
    data.write.format('parquet').mode("overwrite").save(path1)
def df_structurize(input_df, struct):
    # metaColumns = struct.fieldNames()
    # new dataframe of the regex columns
    regexDFColumns = [c for c in input_df.columns if c[0].isdigit()]
    regexDFColumns.append("revid")
    regexDFColumns.append("date_time")
    regexDFColumns.append("articleid")
    regexDFColumns.append("namespace")
    regexDFColumns.append("anon")
    regexDFColumns.append("deleted")
    regexDFColumns.append("revert")
    regexDFColumns.append("reverteds")
    regex_df = input_df.na.replace('None', None).select(*regexDFColumns)
    # regex_df.show(n=5, vertical=True)

    # combine the regex columns into one column, if not None/null
    # this has: revid, article_id, date/time, regexes, core_regexes, regex_bool, core_bool
    onlyRegexCols = [c for c in regex_df.columns if c[0].isdigit()]
    coreDFColumn = findCoreColumns(onlyRegexCols)
    replaced_df = multi_replace_wps(onlyRegexCols)(regex_df)
    # debugging previews, with and without the replace step:
    # replaced_df.select(regex_df.revid, regex_df.date_time,
    #                    f.year(regex_df.date_time).alias("year"), f.month(regex_df.date_time).alias('month'),
    #                    f.concat_ws(', ', *onlyRegexCols).alias('regexes'),
    #                    f.concat_ws(', ', *coreDFColumn).alias('core_regexes')).show(n=50, truncate=200)
    # print("If we didn't do the replace stuff:")
    # regex_df.select(<same columns as above>).show(n=50, truncate=200)
    regex_one_df = replaced_df.select(
        regex_df.articleid, regex_df.namespace, regex_df.anon,
        regex_df.deleted, regex_df.revert, regex_df.reverteds,
        regex_df.revid, regex_df.date_time,
        f.year(regex_df.date_time).alias("year"),
        f.month(regex_df.date_time).alias('month'),
        f.concat_ws(', ', *onlyRegexCols).alias('regexes'),
        f.concat_ws(', ', *coreDFColumn).alias('core_regexes'))
    # if you don't want to use the replaced version, select the same columns from regex_df instead

    # make again sure the empty ones are None/null
    regex_one_df = regex_one_df.na.replace('', None)
    # regex_bool and core_bool help us keep track of which revisions end in text that have PI
    # regex_one_df = regex_one_df.select(*regex_one_df,
    #     f.when(regex_one_df.regexes.isNotNull(), 1).otherwise(0).alias('regex_bool'),
    #     f.when(regex_one_df.core_regexes.isNotNull(), 1).otherwise(0).alias('core_bool'))
    # regex_one_df.show(n=5, vertical=True)
    return regex_one_df
def _setup_dataframe(spark, sqlContext, dataset_multiplier_factor, append_ids=True) -> pyspark.sql.DataFrame:
    """Set up a PySpark dataframe to run against.

    Creates a PySpark dataframe and cross-joins it with a table of length
    :dataset_multiplier_factor: to increase the volume of data for benchmarking.

    Returns:
        A PySpark dataframe with random phrases for string distance testing.
    """
    df = _fetch_phrase_pairs()
    logger.info(f'{len(df):,} word pairs')
    pyspark_df = spark.createDataFrame(df, ['left', 'right'])
    pyspark_df = pyspark_df.repartition(10)
    pyspark_df.cache().count()
    logger.debug('Increasing data volume')
    range_df = sqlContext.range(dataset_multiplier_factor)
    if append_ids:
        range_df = range_df.withColumn('id_string', ps_funcs.lpad('id', 12, "0"))
        pyspark_df = range_df.crossJoin(pyspark_df).select(
            ps_funcs.concat_ws(' ', ps_funcs.col('left'), ps_funcs.col('id_string')).alias('left'),
            ps_funcs.concat_ws(' ', ps_funcs.col('right'), ps_funcs.col('id_string')).alias('right'))
    else:
        pyspark_df = range_df.crossJoin(pyspark_df).select(
            ps_funcs.col('left'), ps_funcs.col('right'))
    pyspark_df = pyspark_df.repartition(__DATASET_PARTITIONS)
    record_count = pyspark_df.cache().count()
    logger.info(f'Generated dataframe with {record_count:,} records')
    sample_data = pyspark_df.sample(withReplacement=False, fraction=0.01).limit(1).collect()
    logger.info(f'Sample of benchmarking data: {sample_data}')
    return pyspark_df
def test_concat_ws_arrays():
    gen = ArrayGen(StringGen(nullable=True), nullable=True)
    (s1, s2) = gen_scalars(gen, 2, force_no_nulls=True)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, gen).select(
            f.concat_ws("*", f.array(f.lit('2'), f.lit(''), f.lit('3'), f.lit('Z'))),
            f.concat_ws("*", s1, s2),
            f.concat_ws("-", f.array()),
            f.concat_ws("-", f.array(), f.lit('u')),
            f.concat_ws(None, f.lit('z'), s1, f.lit('b'), s2, f.array()),
            f.concat_ws("+", f.lit('z'), s1, f.lit('b'), s2, f.array()),
            f.concat_ws("*", f.col('b'), f.lit('z')),
            f.concat_ws("*", f.lit('z'), s1, f.lit('b'), s2, f.array(), f.col('b')),
            f.concat_ws("-", f.array(f.lit(None))),
            f.concat_ws("-", f.array(f.lit('')))))
def transfLineitems(ds):
    invoiceDS = ds
    invoiceDS = invoiceDS.withColumn("TypeOfService", col("_LineItems._Description")) \
        .withColumn("ServiceAmount", col("_LineItems._TotalPriceNetto"))
    if invoiceDS.schema["TypeOfService"].dataType == ArrayType(StringType()):
        invoiceDS = invoiceDS.withColumn("TypeOfService", concat_ws(",", col("TypeOfService"))) \
            .withColumn("ServiceAmount", concat_ws(",", col("ServiceAmount")))
    invoiceDS = invoiceDS.withColumn(
        "TypeOfService", regexp_replace(col("TypeOfService"), "\n", " "))
    return invoiceDS
def get_warning():
    try:
        print("Deleting expired warning data")
        levels = ["YJFL004", "YJFL012", "YJFL003", "YJFL001"]
        for level in levels:
            print(f"Warning level: {level}")
            delete_all(hbase["table"], row_prefix=level)
        print(f"{str(dt.now())} warning")
        result = get_warning_result(white_list, city='岳阳市', com_id='011114306',
                                    day='20190601', cluster_dir=cluster_path + "/")
        # the values in highprice_30days_order must be float
        result["highprice_30days_order"] = result["highprice_30days_order"].apply(
            lambda x: json.dumps(x, ensure_ascii=False))
        df = spark.createDataFrame(result) \
            .withColumn("classify_id", f.concat_ws("_", col("classify_level1_code"), col("cust_id")))
        cols = df.columns
        cols.remove("classify_id")
        df.foreachPartition(lambda x: write_hbase1(x, cols, hbase))
    except Exception:
        tb.print_exc()
def preprocessDF(self, df, cols): """ Input: $df represents a DataFrame $cols represents the list of columns (in $df) that will be concatenated and be tokenized Output: Return a new DataFrame that adds the "joinKey" column into the input $df Comments: The "joinKey" column is a list of tokens, which is generated as follows: (1) concatenate the $cols in $df; (2) apply the tokenizer to the concatenated string Here is how the tokenizer should work: (1) Use "re.split(r'\W+', string)" to split a string into a set of tokens (2) Convert each token to its lower-case (3) Remove stop words """ stop_words = self.stopWordsBC def tokenized_filterized_string(string): string = re.sub('\s+', ' ', string).strip().lower( ) # Remove extra whitespace and finally remove trailing spaces tokens = re.split(r'\W+', string) stop_words.add('') tokens = set(tokens) - stop_words return list(tokens) get_tokenized_string = functions.udf( tokenized_filterized_string, types.ArrayType(types.StringType())) concatanated_column = 'joinKey' df = df.withColumn(concatanated_column, concat_ws(' ', df[cols[0]], df[cols[1]])) df = df.withColumn(concatanated_column, get_tokenized_string(df[concatanated_column])) return df
def preprocessDF(self, df, cols):
    new_df = df.withColumn("joinkey", concat_ws('-', *cols))

    def transform(raw):
        words = []
        s = re.split(r'\W+', raw)
        final_list = []
        for i in s:
            fin_s = i.lower()
            if len(fin_s) > 0:
                final_list.append(fin_s)
        for i in final_list:
            if i not in stop_word:
                words.append(i)
        return words

    stop_word = self.stopWordsBC
    slen = udf(transform, ArrayType(StringType()))
    df1 = new_df.withColumn("joinkey", slen(new_df.joinkey))
    return df1
def load_subtable(self, csv_filepath, uid_name, uid_col_list, csv_bq, passenger_bq=None):
    """
    Load a supporting table to passengers from GCS and save it in BigQuery.

    :param csv_filepath: str input filename
    :param uid_name: str name to give the UID column
    :param uid_col_list: list of str column names to combine into the UID
    :param csv_bq: str output project.dataset.table where the data will be saved
    :param passenger_bq: str, optional BigQuery table to read passengers from;
        if omitted, the already-loaded self.passengers_df is used
    """
    csv_path = 'gs://{}/{}'.format(self.bucket, csv_filepath)
    logger.info(f"Loading address info from {csv_path}")
    csv_df = self.sparkql.read.csv(csv_path, header=True)
    csv_df = csv_df.withColumn(uid_name, sha2(concat_ws("", *uid_col_list), 256))
    if passenger_bq:
        passengers_df = self.sparkql.read.format('bigquery') \
            .option('table', passenger_bq) \
            .load() \
            .withColumnRenamed('uid', 'passenger_uid')
    else:
        passengers_df = self.passengers_df.withColumnRenamed('uid', 'passenger_uid')
    csv_df = csv_df.join(passengers_df.select('email', 'passenger_uid'), on='email', how='left')
    logger.info(f"writing card data to {csv_bq}")
    csv_df.write.format('bigquery') \
        .option('table', csv_bq) \
        .save()
def verification(self, candDF, threshold):
    jaccard_udf = functions.udf(lambda r: jaccard_similarity(r))
    jaccard_df = candDF.withColumn(
        "jaccard",
        jaccard_udf(functions.concat_ws(',', candDF.joinKey1, candDF.joinKey2)))
    return jaccard_df.where(jaccard_df.jaccard >= threshold)
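# jaccard_similarity is not defined in this snippet. A minimal sketch of one way to
# compute the score, assuming joinKey1/joinKey2 are token arrays and passing them to
# the UDF as two separate arguments instead of one concatenated string (the helper
# name jaccard_two_keys is hypothetical):
from pyspark.sql import functions, types

def jaccard_two_keys(key1, key2):
    # |A ∩ B| / |A ∪ B| over the two token sets; 0.0 when both are empty
    s1, s2 = set(key1 or []), set(key2 or [])
    union = s1 | s2
    return float(len(s1 & s2)) / len(union) if union else 0.0

jaccard_udf = functions.udf(jaccard_two_keys, types.DoubleType())
# usage: candDF.withColumn("jaccard", jaccard_udf(candDF.joinKey1, candDF.joinKey2))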
def csv2data(self, donotvectorize):
    lists = self.list_fileNames()
    # print lists
    csvFiles = []
    s3 = boto3.client('s3')
    for l in lists:
        csvFiles.append(l.encode('utf-8').split('/')[1])
    for c in csvFiles:
        print c
        s3.download_file(INPUT_DATA_BUCKET, INPUT_DATA_FOLDER + c, "knn" + INPUT_DATA_TYPE)
    # datas = CreateDF.sc.wholeTextFiles("/home/ab/pyspark/knn" + INPUT_DATA_TYPE)
    # sessions = datas
    # print sessions.first()
    df = self.ReturnDataframe("/home/ab/pyspark/knn" + INPUT_DATA_TYPE)
    columns = df.columns
    columnlength = len(columns)
    print columnlength
    self.vectorize = list(set(self.dataframe.columns) - set(donotvectorize))
    self.dataframe = self.dataframe.withColumn("Features_Joined", concat_ws('_', *self.vectorize))
    self.vectors = [i + "_index" for i in self.vectorize]
    df = self.StringtoVector(self.vectorize)
    y = self.dataframe.select(['Features_Joined']).rdd.map(lambda x: x[0]).collect()
    self.TransformDataframe(self.vectors)
def streaming_sent(dfX):
    # apply sentiment analysis to text stream
    df = pipeline.transform(dfX)
    # select sentiment column from pipeline output
    df = df.select('sentiment.result', "sentiment.metadata") \
        .withColumn('result', F.concat_ws(',', 'result')) \
        .withColumn("result", regexp_replace('result', "positive", '1')) \
        .withColumn("result", regexp_replace('result', "na", '0')) \
        .withColumn("result", regexp_replace('result', "negative", '-1')) \
        .select(F.split('result', ',').alias('sents'), 'metadata')
    # Convert datatypes
    mapper = F.udf(lambda x: [i['confidence'] for i in x], T.ArrayType(T.StringType()))
    df = df.withColumn("metadata", mapper('metadata'))
    df = df.withColumn("metadata", df.metadata.cast("array<float>"))
    # Compute column product
    df_product = df.withColumn(
        "product",
        F.expr("transform(arrays_zip(sents, metadata), x -> x.sents * x.metadata)"))
    # Average array
    array_mean = F.udf(lambda x: float(np.mean(x)), T.FloatType())
    sent_df = df_product.select(array_mean("product").alias("value"))
    return sent_df
def etl_us_cities_demographics(spark, input_dir, output_dir):
    """Clean the US cities demographics data"""
    # this data set is already clean
    # load data
    data_input_full_file_path = f'{input_dir}/us-cities-demographics.csv'
    us_cities_demographics_spark_df = spark.read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1", sep=';') \
        .load(data_input_full_file_path)

    us_cities_demographics_spark_df = us_cities_demographics_spark_df \
        .withColumnRenamed("City", "city") \
        .withColumnRenamed("State", "state") \
        .withColumnRenamed("Median Age", "median_age") \
        .withColumnRenamed("Male Population", "male_population") \
        .withColumnRenamed("Female Population", "female_population") \
        .withColumnRenamed("Total Population", "total_population") \
        .withColumnRenamed("Number of Veterans", "num_of_veterans") \
        .withColumnRenamed("Foreign-born", "foreign_born") \
        .withColumnRenamed("Average Household Size", "avg_house_size") \
        .withColumnRenamed("State Code", "state_code") \
        .withColumnRenamed("Race", "race") \
        .withColumnRenamed("Count", "count") \
        .withColumn('city_state_code',
                    F.concat_ws(', ', F.upper(F.col('city')), F.upper(F.col('state_code'))))

    data_output_full_file_path = f'{output_dir}/us-cities-demographics.parquet'
    us_cities_demographics_spark_df \
        .write \
        .options(encoding="ISO-8859-1") \
        .mode('overwrite') \
        .parquet(data_output_full_file_path)
def etl_airport_code(spark, input_dir, output_dir):
    """Clean the airport code data"""
    # load data
    airport_code_data_input_full_file_path = f'{input_dir}/airport-codes_csv.csv'
    airport_code_spark_df = spark.read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1") \
        .load(airport_code_data_input_full_file_path)

    airport_code_spark_df = airport_code_spark_df \
        .withColumnRenamed('name', 'airport_name') \
        .filter(F.col('iso_country') == 'US')

    # split the iso_region column on '-' and keep the region part
    split_iso_region = F.split(airport_code_spark_df['iso_region'], '-')
    airport_code_spark_df = airport_code_spark_df \
        .withColumn('region', split_iso_region.getItem(1)) \
        .withColumn('municipality_region',
                    F.concat_ws(', ', F.upper(F.col('municipality')), F.upper(F.col('region'))))

    new_airport_code_spark_df = airport_code_spark_df \
        .drop('iso_region') \
        .drop('coordinates')

    data_output_full_file_path = f'{output_dir}/airport-codes.parquet'
    new_airport_code_spark_df \
        .write \
        .options(encoding="ISO-8859-1") \
        .mode('overwrite') \
        .parquet(data_output_full_file_path)
def nest(input_cols, output_col, shape="string", separator=""):
    """
    Concat multiple columns into one with the format specified
    :param input_cols: columns to be nested
    :param output_col: final column with the nested content
    :param separator: char to be used as separator at concat time
    :param shape: final data type, 'array', 'string' or 'vector'
    :return: Spark DataFrame
    """
    df = self
    if has_(input_cols, F.Column):
        # Transform non-Column data to lit columns
        columns = [F.lit(col) if not is_(col, F.Column) else col for col in input_cols]
    else:
        columns = parse_columns(self, input_cols)

    # compare strings with ==, not identity
    if shape == "vector":
        columns = parse_columns(self, input_cols, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
        vector_assembler = VectorAssembler(inputCols=columns, outputCol=output_col)
        df = vector_assembler.transform(df)
    elif shape == "array":
        df = apply_expr(output_col, F.array(*columns))
    elif shape == "string":
        df = apply_expr(output_col, F.concat_ws(separator, *columns))
    else:
        RaiseIt.value_error(shape, ["vector", "array", "string"])

    return df
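# A hedged usage sketch of the three shapes expressed as plain PySpark calls (column
# names are hypothetical): the "string" and "array" branches of nest() boil down to
# concat_ws and array, while "vector" corresponds to a VectorAssembler.
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler

def nest_shapes_example(df):
    as_string = df.withColumn('full_name', F.concat_ws(' ', 'first_name', 'last_name'))
    as_array = df.withColumn('name_parts', F.array('first_name', 'last_name'))
    assembler = VectorAssembler(inputCols=['height', 'weight'], outputCol='features')
    as_vector = assembler.transform(df)
    return as_string, as_array, as_vector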
def load_world_news(spark: SparkSession, input_path: str, bucket: str) -> None:
    """
    Loads the world news to S3. There are 25 news pieces per day, which are concatenated
    into a single row using the asterisk (*) as a separator.

    :param spark: spark session - must be configured to use S3 (AWS credentials, file system)
    :param input_path: input path from where the files should be read
    :param bucket: S3 bucket where to write data
    :return: None
    """
    df_world_news = spark.read.option("header", "true").option("delimiter", ",").csv(input_path)
    df_world_news = df_world_news.groupBy('Date').agg(
        functions.concat_ws('*', functions.collect_list(functions.col('News'))).alias('news'))

    # Filter out badly formatted Date columns
    df_world_news = df_world_news \
        .filter(functions.to_date(functions.col('Date')).isNotNull()) \
        .orderBy('Date', ascending=False)
    df_world_news = df_world_news.withColumn('year', functions.year('Date'))

    df_world_news \
        .coalesce(1) \
        .write \
        .partitionBy('year') \
        .mode('overwrite') \
        .csv(f"s3a://{bucket}/data/news")
def preprocessDF(self, df, cols):
    tokenize_udf = udf(lambda line: lineTokenizer(line), ArrayType(StringType(), False))
    # can we remove the hardcoding of the cols?
    df_joinkey = df.withColumn(
        "joinKey",
        tokenize_udf(concat_ws(' ', cols[0], cols[1]).alias('joinKey'))).cache()
    return df_joinkey
def prepareDatasets(sc, spark):
    buisHeader = ['business_id', 'name', 'neighborhood', 'address', 'city', 'state',
                  'postal_code', 'latitude', 'longitude', 'stars', 'review_count',
                  'is_open', 'categories']
    buis = sc.textFile(datapath + 'yelp_business.csv', use_unicode=False)
    buis = buis.filter(lambda row: not row.startswith('business_id,name')) \
        .map(lambda row: re.findall(r'(?:[^,"]|"(?:\\.|[^"])*")+', row.replace(',,', ', ,'))) \
        .map(lambda row: map(lambda x: x.replace('"', ''), row)) \
        .map(lambda row: dict(zip(buisHeader, row))) \
        .filter(lambda row: row['business_id'] and row['longitude'] and row['latitude']) \
        .filter(lambda row: row['business_id'].strip() and row['longitude'].strip() and row['latitude'].strip()) \
        .toDF()
    buis = buis.select('business_id', 'name', 'city', 'state', 'postal_code', 'categories',
                       buis['latitude'].cast('float'), buis['longitude'].cast('float'),
                       buis['stars'].cast('float'), buis['review_count'].cast('int'),
                       buis['is_open'].cast('int')) \
        .dropna(how='any', subset=['business_id', 'longitude', 'latitude'])

    def reviews_mapper(index, lines):
        import csv
        reader = csv.reader(lines)
        if index == 0:
            lines.next()
        for row in reader:
            if len(row) == 9 and len(row[1]) == 22:
                yield row

    reviewsHeader = ["review_id", "user_id", "business_id", "stars", "date", "text",
                     "useful", "funny", "cool"]
    reviews = sc.textFile(datapath + 'yelp_review.csv', use_unicode=False) \
        .mapPartitionsWithIndex(reviews_mapper) \
        .map(lambda x: dict(zip(reviewsHeader, x))) \
        .toDF()
    reviews = reviews.select(
        "review_id", "user_id", "business_id", "text",
        reviews["stars"].cast('float'), reviews["date"].cast('date'),
        reviews["useful"].cast('int'), reviews["funny"].cast('int'),
        reviews["cool"].cast('int')) \
        .filter(reviews.text.isNotNull()) \
        .filter(reviews.business_id.isNotNull())
    reviews = reviews.alias('a').join(buis.alias('b'), sf.col('b.business_id') == sf.col('a.business_id')) \
        .select('b.*', 'a.text')  # ,'a.user_id')
    reviews = reviews.where(
        'longitude > {:f} and longitude < {:f} and latitude > {:f} and latitude < {:f}'
        .format(westAMER, eastAMER, southAMER, northAMER)).cache()

    id_text = reviews.select('business_id', 'text') \
        .groupBy('business_id').agg(sf.concat_ws(' ', sf.collect_list("text")).alias('text_concat'))
    reviews = reviews.drop(reviews.text) \
        .select('business_id', 'categories', 'state', 'stars') \
        .alias('a').join(id_text.alias('b'), sf.col('b.business_id') == sf.col('a.business_id')) \
        .select('a.*', 'b.text_concat') \
        .distinct() \
        .withColumnRenamed('text_concat', 'text')

    # some data cleansing:
    reviews = reviews.withColumn('text', sf.regexp_replace(reviews.text, '\\/', '/'))

    def cleanse(text):
        re_punc = re.compile('[' + re.escape(punctuation) + '0-9\\n\\t\\r]')
        re_spc = re.compile('[ ]+')  # get rid of extra spaces
        return re_spc.sub(' ', re_punc.sub(" ", text))

    cleanser = sf.udf(lambda x: cleanse(x))
    reviews = reviews.withColumn('text', cleanser('text'))

    # tokenizing and removing stop words:
    import pyspark.ml.feature as sparkml
    from pyspark.ml import Pipeline
    tokenizer = sparkml.Tokenizer(inputCol="text", outputCol="words")
    swremover = sparkml.StopWordsRemover(inputCol='words', outputCol='words_clean')
    pipeline = Pipeline(stages=[tokenizer, swremover])
    reviews = pipeline.fit(reviews).transform(reviews)
    reviews = reviews.drop('text', 'words')
    return reviews.cache()
if __name__ == "__main__": if len(sys.argv) < 3: print("Usage: pretty-cluster.py <input> <page-out> <book-out>", file=sys.stderr) exit(-1) sc = SparkContext(appName="Proteus Pages") sqlContext = SQLContext(sc) raw = sqlContext.read.load(sys.argv[1]) cols = set(raw.columns) idcols = [col(x) for x in ['identifier', 'issue', 'book'] if x in cols] df = raw.withColumn('identifier', regexp_replace(coalesce(*idcols), '[^A-Za-z0-9]+', '')) counts = df.groupBy('identifier').count().select(col('identifier'), col('count').alias('imagecount')) appendID = udf(lambda book, text: '%s <archiveid tokenizetagcontent="false">%s</archiveid>' % (text, book)) renamed = df.join(counts, 'identifier')\ .drop('regions')\ .withColumn('pageNumber', col('seq'))\ .withColumn('name', concat_ws('_', col('identifier'), col('seq')))\ .withColumn('text', regexp_replace(col('text'), '\\n', '<br>\\\n')) renamed.withColumn('text', appendID(col('identifier'), col('text')))\ .write.format('json').save(sys.argv[2]) renamed.rdd.groupBy(lambda r: r.identifier).map(pageCat).toDF()\ .write.format('json').save(sys.argv[3]) sc.stop()
from __future__ import print_function

import sys
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import lit, concat, concat_ws, regexp_replace

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: trove-load.py <input json> <output parquet>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Trove Load")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.json(sys.argv[1])
    df = raw.na.drop(subset=['id', 'fulltext']).dropDuplicates(['id'])
    # decode the HTML entities left in the full text
    df.select(concat(lit('trove/'), df.id).alias('id'),
              concat_ws('/', lit('trove'), df.titleId, df.date).alias('issue'),
              concat(lit('trove/'), df.titleId).alias('series'),
              df.date,
              df.firstPageId,
              df.firstPageSeq.cast('int').alias('seq'),
              df.heading.alias('title'),
              df.category,
              regexp_replace(regexp_replace(df.fulltext, '&amp;', '&'), '&lt;', '<').alias('text')) \
        .write.save(sys.argv[2])
    sc.stop()
def tokenize(s):
    # word_tokenize uses PunktSentenceTokenizer first, then
    # treebank_word_tokenizer on those so can get nested
    # lists.
    # return nltk.tokenize.word_tokenize(s)
    # this is just the treebank tokenizer
    return [word for word in t.tokenize(s) if word not in stopwords_set]

udf_tokenize = sql.udf(tokenize, types.ArrayType(types.StringType()))

(idb_df
 .select(sql.concat_ws(" ",
                       idb_df["data.dwc:occurrenceRemarks"],
                       idb_df["data.dwc:eventRemarks"],
                       idb_df["data.dwc:fieldNotes"])
         .alias("note"),
         idb_df["uuid"])
 .where(sql.column("note") != "")
 .withColumn("tokens", udf_tokenize(sql.column("note")))
 .select(sql.column("uuid"),
         sql.explode(sql.column("tokens")).alias("token"))
 .groupBy(sql.column("uuid"), sql.column("token"))
 .count()
 .write
 .mode("overwrite")
 .parquet("/guoda/data/idigbio-{}-tf.parquet".format(idb_df_version))
 )
print("Usage: pretty-cluster.py <metadata> <input> <output> [<query>]", file=sys.stderr) exit(-1) sc = SparkContext(appName="Prettyprint Clusters") sqlContext = SQLContext(sc) outpath = sys.argv[3] (outputFormat, outputOptions) = guessFormat(outpath, "json") ## Should do more field renaming in meta to avoid clashing with fields in raw. meta = sqlContext.read.json(sys.argv[1])\ .dropDuplicates(['series']) constructURL = udf(lambda url, corpus, id, regions: formatURL(url, corpus, id, regions)) df = sqlContext.read.load(sys.argv[2]) \ .withColumnRenamed('title', 'doc_title')\ .withColumnRenamed('lang', 'doc_lang')\ .withColumn('url', constructURL(col('page_access'), col('corpus'), col('id'), col('regions')))\ .drop('locs').drop('pages').drop('regions')\ .join(meta, 'series', 'left_outer') filtered = df.join(df.filter(sys.argv[4]).select('cluster').distinct(), 'cluster') \ if len(sys.argv) >= 5 else df filtered.withColumn('lang', concat_ws(',', col('lang'))) \ .orderBy(desc('size'), 'cluster', 'date', 'id', 'begin')\ .write.format(outputFormat).options(**outputOptions).save(outpath) sc.stop()
sqlContext = SQLContext(sc)

# people DF
RDD = sc.textFile(datapath + "people.csv", minPartitions=6, use_unicode=False)
header = RDD.first()
RDD = RDD.filter(lambda x: x != header)
PeopleDF = RDD.map(lambda x: (x.split(",")[0], x, float(x.split(",")[-1]))) \
    .toDF(["PeopleID", "PeopleFeats", "numeric"]).cache()
print "number of distinct people: "
print PeopleDF.count()

# train DF
RDD = sc.textFile(datapath + "act_train.csv", minPartitions=6, use_unicode=False)
header = RDD.first()
RDD = RDD.filter(lambda x: x != header)
trainDF = RDD.map(lambda x: (x.split(",")[0], x)).toDF(["PeopleID", "trainFeats"])
trainDF = trainDF.join(PeopleDF, "PeopleID", "left_outer")
trainDF = trainDF.select("PeopleID",
                         concat_ws(',', trainDF.PeopleFeats, trainDF.trainFeats).alias("text"))
trainRDD = trainDF
print "number of train samples: "
print trainDF.count()

# test DF
RDD = sc.textFile(datapath + "act_test.csv", minPartitions=6, use_unicode=False)
header = RDD.first()
RDD = RDD.filter(lambda x: x != header)
testDF = RDD.map(lambda x: (x.split(",")[0], x)).toDF(["PeopleID", "trainFeats"])
testDF = testDF.join(PeopleDF, "PeopleID", "left_outer")
testDF = testDF.select("PeopleID",
                       concat_ws(',', testDF.PeopleFeats, testDF.trainFeats).alias("text"))
print "number of test samples: "
print testDF.count()
"""