def main(lowtopic_dist, hightopic_dist, output_folder): lowtopic_df = spark.read.csv(lowtopic_dist, header=True) hightopic_df = spark.read.csv(hightopic_dist, header=True) df_init_low = lowtopic_df.select( lowtopic_df['BusinessID'], lowtopic_df['BusinessName'], join_udf( lowtopic_df['topic_distribution']).alias('topic_distribution')) df_expl_low = df_init_low.select(df_init_low['BusinessID'], df_init_low['BusinessName'], \ functions.posexplode(functions.split(df_init_low['topic_distribution'], ','))) \ .withColumnRenamed('pos', 'TopicID') \ .withColumnRenamed('col', 'TopicDistribution') df_expl_low.coalesce(1).write.csv(output_folder + '/Topic_low_topic_exploded', header=True) df_init_high = hightopic_df.select( hightopic_df['BusinessID'], hightopic_df['BusinessName'], join_udf( hightopic_df['topic_distribution']).alias('topic_distribution')) df_expl_high = df_init_high.select(df_init_high['BusinessID'], df_init_high['BusinessName'], \ functions.posexplode(functions.split(df_init_high['topic_distribution'], ','))) \ .withColumnRenamed('pos', 'TopicID') \ .withColumnRenamed('col', 'TopicDistribution') df_expl_high.coalesce(1).write.csv(output_folder + '/Topic_high_topic_exploded', header=True)
def join_urls_and_weighted_word_vectors(
        split_urls_and_coefficients: DataFrame,
        word_vectors: DataFrame) -> DataFrame:
    """
    Explodes split URLs to words and calculates each word's weighted word vector.
    :param split_urls_and_coefficients: A DataFrame of split URLs and coefficients
        with columns: id, url, split_url, coefficients.
    :param word_vectors: A DataFrame of words and vectors with columns: word, vector.
    :return: A DataFrame of URLs and weighted word vectors with columns:
        id, url, pos, word, weighted_word_vector, coefficient.
    """
    combined_split_urls_and_coefficients = split_urls_and_coefficients \
        .rdd.map(URLVectorCalculator.combine_split_urls_and_coefficients).toDF()
    return combined_split_urls_and_coefficients \
        .select("id", "url", posexplode("split_urls_and_coefficients")) \
        .withColumnRenamed("col", "word_and_coefficient") \
        .rdd.map(lambda e: IDURLPosWordCoefficient(
            e.id, e.url, e.pos,
            e.word_and_coefficient[0], e.word_and_coefficient[1])) \
        .toDF().join(word_vectors, "word") \
        .rdd.map(lambda e: IDURLPosWordVectorCoefficient(
            id=e.id, url=e.url, pos=e.pos, word=e.word,
            coefficient=e.coefficient,
            weighted_word_vector=list([float(x) for x in
                                       np.multiply(e.vector.values, e.coefficient)]))) \
        .toDF()
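# A distilled, self-contained sketch of the idiom above (posexplode the
# word/coefficient pairs, join on word, scale the vector), assuming an active
# SparkSession named `spark`; it illustrates the dataflow only, not the
# URLVectorCalculator API itself, and all values are made up.
from pyspark.sql.functions import posexplode, udf
from pyspark.sql.types import ArrayType, DoubleType

pairs = spark.createDataFrame(
    [(1, "http://example.com/red-shoes", [("red", 0.4), ("shoes", 0.6)])],
    "id int, url string, words array<struct<word:string, coefficient:double>>")
vectors = spark.createDataFrame(
    [("red", [1.0, 0.0]), ("shoes", [0.0, 1.0])],
    "word string, vector array<double>")

scale = udf(lambda v, c: [float(x) * c for x in v], ArrayType(DoubleType()))
weighted = pairs \
    .select("id", "url", posexplode("words")) \
    .select("id", "url", "pos", "col.word", "col.coefficient") \
    .join(vectors, "word") \
    .withColumn("weighted_word_vector", scale("vector", "coefficient"))
weighted.show(truncate=False)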
def transform_abstracts_words(dataframe): udf_function_clean = udf(generate_cleaned_abstracts, StringType()) udf_function_sentiment = udf(generate_sentiment, DoubleType()) df_abstracts = (dataframe.select( "paper_id", func.posexplode("abstract").alias("pos", "value")).select( "paper_id", "pos", "value.text").withColumn( "ordered_text", func.collect_list("text").over( Window.partitionBy("paper_id").orderBy("pos")) ).groupBy("paper_id").agg( func.max("ordered_text").alias("sentences")).select( "paper_id", func.array_join("sentences", " ").alias("abstract")).withColumn( "words", func.size(func.split( "abstract", "\s+")))) df_abstracts = df_abstracts.withColumn("clean_abstract", udf_function_clean("abstract")) df_abstracts = df_abstracts.withColumn( "sentiment_abstract", udf_function_sentiment("clean_abstract")) return df_abstracts
def get_filtered_by_month(data: DataFrame) -> DataFrame:
    """
    Transforms periods "start_date - end_date" into one row per covered month,
    with the year and month number.

    source:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-01-01|2018-01-09|
    +---+----------+----------+

    result:
    +---+----+-----+
    |key|year|month|
    +---+----+-----+
    |  5|2018|    1|
    +---+----+-----+
    """
    transformed_data = data \
        .withColumn("start_date", F.trunc("start_date", "month")) \
        .withColumn("monthsDiff", F.months_between("end_date", "start_date")) \
        .withColumn("repeat", F.expr("split(repeat(',', monthsDiff), ',')")) \
        .select("*", F.posexplode("repeat").alias("date", "val")) \
        .withColumn("month_start", F.expr("add_months(start_date, date)")) \
        .withColumn("year", F.year("month_start")) \
        .withColumn("month", F.month("month_start")) \
        .drop('start_date', 'end_date', 'monthsDiff', 'repeat', 'val', 'date', 'month_start') \
        .dropDuplicates()

    return transformed_data
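# A minimal usage sketch for get_filtered_by_month, assuming an active
# SparkSession named `spark` and `from pyspark.sql import functions as F`
# as above; the sample period mirrors the docstring example.
from datetime import date

periods = spark.createDataFrame(
    [(5, date(2018, 1, 1), date(2018, 1, 9))],
    ["key", "start_date", "end_date"])
get_filtered_by_month(periods).show()
# -> one row per covered month: (key=5, year=2018, month=1)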
def with_exploded_hits(df: DataFrame) -> DataFrame:
    return (
        df
        .select(
            F.posexplode('hit_page_ids').alias('hit_position', 'hit_page_id'),
            *df.columns)
        .drop('hit_page_ids')
        .withColumn('clicked', F.expr('array_contains(click_page_ids, hit_page_id)'))
        .drop('click_page_ids'))
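# A minimal sketch exercising with_exploded_hits, assuming an active
# SparkSession named `spark` and `pyspark.sql.functions as F` as above;
# the query_id column and the sample values are illustrative only.
searches = spark.createDataFrame(
    [("q1", [101, 102, 103], [102])],
    "query_id string, hit_page_ids array<int>, click_page_ids array<int>")
with_exploded_hits(searches).show()
# -> (hit_position, hit_page_id, query_id, clicked):
#    (0, 101, q1, false), (1, 102, q1, true), (2, 103, q1, false)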
def samples_from_vcf(vcf):
    """
    Given a dataframe of (filename, data) rows in VCF-like format, returns the
    sample names together with their column index.
    Samples come from the header line that starts with a single '#'
    (the '#CHROM' line); that line is split on "\t" to get the column names.
    """
    temp = vcf.filter(vcf.data[0:2] == '#C') \
        .select('filename', f.posexplode(f.split('data', '\t'))) \
        .filter('pos > 9')
    return temp.select((temp.pos - 1).alias('SAMPLE_IDX'),
                       temp.col.alias('SAMPLE_NAME'),
                       'filename')
def gts_from_impute (infile): # Get the main data and put a unique index on each variant maindata = infile.filter(infile.data[0:1] != "#") splitdata = maindata.select("filename",f.split(maindata.data,"[\t ]+").alias("split_data"),maindata.lineid.alias("VAR_IDX")) gtdata1 = splitdata.select("filename", "VAR_IDX", f.posexplode(splitdata.split_data)).toDF("filename","VAR_IDX","COLUMN_IDX","GTPROB").filter("COLUMN_IDX > 4") # Now, get subject ID and which GT gtdata2 = gtdata1.select("filename", "VAR_IDX", "GTPROB", "COLUMN_IDX", f.floor((gtdata1.COLUMN_IDX - 5) / 3).alias("SAMPLE_IDX"), ((gtdata1.COLUMN_IDX - 5) % 3).cast(StringType()).alias("GT_IDX")) gtdata3 = rkutil.withColumnsRenamed(gtdata2.groupBy("filename","VAR_IDX","SAMPLE_IDX").pivot("GT_IDX",["0","1","2"]).agg(f.collect_list("GTPROB")), ["0","1","2"],["c0","c1","c2"]) gtdata4 = gtdata3.select("filename","VAR_IDX","SAMPLE_IDX", f.element_at(gtdata3.c0, 1).cast(FloatType()).alias("P11"), f.element_at(gtdata3.c1, 1).cast(FloatType()).alias("P12"), f.element_at(gtdata3.c2, 1).cast(FloatType()).alias("P22")) return(gtdata4)
def populate(cls, spark=None):
    sdf = (TrainClickGrouped.query(spark)
           .drop('joblist', 'rel_list', 'id_list', 'jobno_list',
                 'action_list', 'text_score')
           .select('*', f.posexplode('job_rel_list').alias('pos_in_list', 'job_rel_in_list'))
           .drop('job_rel_list')
           .select('*', 'job_rel_in_list.*')
           .drop('job_rel_in_list'))
    cls.write(sdf, compression='snappy')
def prepData(category):
    df = spark.read.parquet("gs://core_bucket_abell/aspect_ranking/extracted_aspects/Extracted "
                            + category + ".parquet")

    # explode aspects
    spread_df = df.select("*", F.posexplode("indiv_aspects").alias("pos", "split_aspect"))

    # get unique by category/review and split aspect - remove blank aspects
    unique_cat_aspect = spread_df.groupBy(*('asin', "reviews", "main_cat", "categories",
                                            "scores", "split_aspect", "idx")).count()\
        .withColumn("split_aspect", F.lower(F.col("split_aspect")))\
        .withColumn("split_aspect", rmStop_udf(F.col("split_aspect")))\
        .filter("split_aspect != ''")

    return unique_cat_aspect
def populate(cls, spark=None):
    sdf = (
        TestsetClickProcessed.query(spark)
        # .drop('joblist', 'rel_list', 'id_list', 'jobno_list', 'action_list')
        .select('*', f.posexplode('joblist').alias('pos_in_list', 'job'))
        .withColumn('rel', f.lit(0))
        .drop('joblist')
        # .select('*', 'job_in_list.*')
        # .drop('job_in_list')
    )
    cls.write(sdf, compression='snappy')
def main(self, sc: SparkContext, *args): os.makedirs(os.path.join(DATASET_DIR, clean_filename(self.filter_city)), exist_ok=True) spark = SparkSession(sc) train_df = spark.read.csv(self.input()[0].path, header=True, inferSchema=True) train_df = train_df.withColumn("impressions_array", F.split(train_df.impressions, "\|")) meta_df = spark.read.csv(self.input()[1].path, header=True, inferSchema=True) # Filter dataset if self.filter_city != 'all': if self.filter_city == 'recsys': train_df = train_df.filter(train_df.city.isin(RECSYS_CITIES)) else: train_df = train_df.filter(train_df.city == self.filter_city) # Filter reference reference_df = train_df.select("reference").distinct() # Filte item impressions item_id_df = train_df.select( posexplode("impressions_array").alias( "pos_item_idx", "reference")).select("reference").distinct() #raise(Exception(train_df.show())) item_id_df = item_id_df.union(reference_df).select( "reference").distinct() meta_df = meta_df.join( item_id_df, meta_df.item_id == item_id_df.reference).select( "item_id", "properties") if self.sample_size > 0: train_df = train_df.sort("timestamp", ascending=False).limit(self.sample_size) # Save train_df.toPandas().to_csv(self.output()[0].path, index=False) meta_df.toPandas().to_csv(self.output()[1].path, index=False)
def gts_from_vcf(vcf, *variant):
    """
    Given a VCF file, and an optional variant file to filter on,
    returns the genotype (GT) matrix.
    """
    if len(variant) == 0:
        maindata = vcf.filter(vcf.data[0:1] != "#")
    else:
        var = variant[0].withColumnRenamed('filename', 'filename2')
        maindata = vcf.join(var, (var.filename2 == vcf.filename) & (var.VAR_IDX == vcf.lineid), "inner")
    splitdata = maindata.select(maindata.filename,
                                f.split(maindata.data, "[\t ]+").alias("split_data"),
                                maindata.lineid.alias("VAR_IDX"))
    # Explode out the genotype data, but leave out the first nine columns
    gt = splitdata.select('filename', 'VAR_IDX', f.posexplode(splitdata.split_data)) \
        .toDF("filename", "VAR_IDX", "SAMPLE_IDX", "GT") \
        .filter("SAMPLE_IDX >= 9")
    return gt
def _transform_rec_info(self, rec_info, item_ids_dataset): import pyspark.sql.functions as F if self.output_user_embeddings: user_embeddings = rec_info.select( self.increasing_id_column_name, self.recommendation_info_column_name + '.user_embedding') rec_info = rec_info.withColumn( self.recommendation_info_column_name, F.arrays_zip(self.recommendation_info_column_name + '.indices', self.recommendation_info_column_name + '.distances')) rec_info = rec_info.select( self.increasing_id_column_name, (F.posexplode(self.recommendation_info_column_name).alias( 'pos', self.recommendation_info_column_name))) rec_info = rec_info.select( self.increasing_id_column_name, 'pos', F.col(self.recommendation_info_column_name + '.0').alias('index'), F.col(self.recommendation_info_column_name + '.1').alias('distance')) rec_info = rec_info.join(item_ids_dataset, F.col('index') == F.col('id')) w = pyspark.sql.Window.partitionBy( self.increasing_id_column_name).orderBy('pos') if self.output_item_embeddings: rec_info = rec_info.select( self.increasing_id_column_name, 'pos', (F.struct('name', 'distance', 'item_embedding').alias( self.recommendation_info_column_name))) else: rec_info = rec_info.select( self.increasing_id_column_name, 'pos', (F.struct('name', 'distance').alias( self.recommendation_info_column_name))) rec_info = rec_info.select( self.increasing_id_column_name, 'pos', (F.collect_list( self.recommendation_info_column_name).over(w).alias( self.recommendation_info_column_name))) rec_info = rec_info.groupBy(self.increasing_id_column_name).agg( F.max(self.recommendation_info_column_name).alias( self.recommendation_info_column_name)) if self.output_user_embeddings: rec_info = rec_info.join(user_embeddings, self.increasing_id_column_name) return rec_info
def get_filtered_by_week(data: DataFrame) -> DataFrame: """ Method transforms periods "start_date - end_date" to year and week number of year source: +---+----------+----------+ |key|start_date| end_date| +---+----------+----------+ | 5|2018-01-01|2018-01-09| +---+----------+----------+ result: +---+----------+----------+ |key| year| week_num| +---+----------+----------+ | 5| 2018| 1| | 5| 2018| 2| +---+----------+----------+ """ max_week_number = 53 transformed_data = data \ .withColumn('start_week', F.weekofyear('start_date')) \ .withColumn('weeks_diff', F.ceil(F.datediff(F.col('end_date'), F.col('start_date')) / 7)) \ .withColumn("year", F.year("start_date")) \ .withColumn("repeat", F.expr("split(repeat(',', weeks_diff), ',')")) \ .select("*", F.posexplode("repeat").alias("week_add", "val")) \ .withColumn('total_week_num', F.col('start_week') + F.col('week_add')) \ .withColumn('add_year', (F.col('total_week_num') / max_week_number).cast(IntegerType())) \ .withColumn('total_week_num', F.col('total_week_num') - (max_week_number * F.col('add_year'))) \ .withColumn('week_num', F.when(F.col('total_week_num') == 0, 1) .otherwise(F.col('total_week_num'))) \ .withColumn('year', F.col('year') + F.col('add_year')) \ .drop('start_date', 'end_date', 'week_add', 'repeat', 'val', 'date', 'add_year', 'weeks_diff', 'total_week_num') \ .dropDuplicates() return transformed_data
def Ranking_evaluator (spark,model, val, metric_type): val.createOrReplaceTempView('val') val_user = spark.sql('SELECT DISTINCT user_id FROM val') #val_user = val.select('user_id').distinct() val_rec = model.recommendForUserSubset(val_user,500) #val_rec.printSchema() val_rec = val_rec.select('user_id','recommendations',f.posexplode('recommendations')).drop('pos').drop('recommendations') val_rec = val_rec.select('user_id',f.expr('col.book_id'),f.expr('col.rating')) w= Window.partitionBy('user_id') val_recrank=val_rec.select('user_id',f.collect_list('book_id').over(w).alias('rec_rank')).sort('user_id').distinct() val = val.sort(f.desc('rating')) val_truerank=val.select('user_id', f.collect_list('book_id').over(w).alias('true_rank')).sort('user_id').distinct() scoreAndLabels = val_recrank.join(val_truerank,on=['user_id'],how='inner') rankLists=scoreAndLabels.select("rec_rank", "true_rank").rdd.map(lambda x: tuple([x[0],x[1]])).collect() ranks = spark.sparkContext.parallelize(rankLists) metrics = RankingMetrics(ranks) MAP = metrics.meanAveragePrecision Precision = metrics.precisionAt(500) NDCG = metrics.ndcgAt(500) if metric_type == 'Precision': return Precision, {'MAP': MAP,'NDCG': NDCG} elif metric_type == 'MAP': return MAP, {'Precision': Precision,'NDCG': NDCG} elif metric_type == 'NDCG': return NDCG, {'MAP': MAP, 'Precision': Precision} else: return None
)\ .show(5,False) companiesJson = [ """{"company":"NewCo","employees":[{"firstName":"Sidhartha","lastName":"Ray"},{"firstName":"Pratik","lastName":"Solanki"}]}""", """{"company":"FamilyCo","employees":[{"firstName":"Jiten","lastName":"Pupta"},{"firstName":"Pallavi","lastName":"Gupta"}]}""", """{"company":"OldCo","employees":[{"firstName":"Vivek","lastName":"Garg"},{"firstName":"Nitin","lastName":"Gupta"}]}""", """{"company":"ClosedCo","employees":[]}""" ] companiesRDD = sparkSession.sparkContext.parallelize(companiesJson) companiesDF = sparkSession.read.json(companiesRDD) companiesDF.show(5, False) companiesDF.printSchema() employeeDfTemp = companiesDF.select("company", explode("employees").alias("employee")) employeeDfTemp.show() employeeDfTemp2 = companiesDF.select( "company", posexplode("employees").alias("employeePosition", "employee")) employeeDfTemp2.show() employeeDf = employeeDfTemp.select("company", expr("employee.firstName as firstName")) employeeDf.select("*", when(col("company") == "FamilyCo", "Premium") .when(col("company") == "OldCo", "Legacy") .otherwise("Standard").alias("Tier"))\ .show(5,False)
def mapping_sequence(sdf, join_sdf, column='item_id_collect__sub', join_column=None, how='inner', dtype=ArrayType(ArrayType(FloatType())), fillna=None, drop=False, skew_quantile=0.9, orderby_flag=True): """ Paramter: ---------- sdf: pyspark dataframe to be processed join_sdf: pyspark dataframe which be joined by sdf column: str of sdf's col to be mapping join_column: str of join_sdf's col to be joined by sdf how: str of the join mode which sdf.join(join_sdf) dtype: pyspark.sql.types of the return values Return: ---------- sdf: pyspark dataframe of sdf.join(join_sdf) by mapping_sequence Example: ---------- sdf=mapping_sequence(sdf,join_sdf,column='item_id_collect__sub') """ if join_column is None: join_column = column resdtype = len(re.findall(re.compile(r'(ArrayType)'), str(dtype))) if resdtype == 1: def array_eval_udf(x): return F.col(x).cast(dtype) else: array_eval_udf = F.udf(lambda x: [eval(i) for i in x], dtype) join_other_list = [ c + '__explode' for c in join_sdf.columns if c != join_column ] def other_list_fun(all_, pop_): return list(filter(lambda x: x not in pop_, all_)) temp1 = sdf.withColumn('increasing_id', F.monotonically_increasing_id()) temp2 = temp1.select( F.posexplode(column).alias(column + '__explodeindex', column + '__explodecol'), *temp1.columns) join_sdf = eval("join_sdf.withColumnRenamed" + ".withColumnRenamed".join([ f"('{c}','%s')" % (f"{c}__explode" if c != join_column else column + '__explodecol') for c in join_sdf.columns ])) if skew_quantile is not None: # temp2=sdf_join_by_dataskew(temp2,join_sdf,raw_column=column+'__explodecol',how=how) temp2=sdf_join_by_dataskew_quantile(temp2,join_sdf,raw_column=column+'__explodecol',how=how,\ skew_quantile=skew_quantile) else: temp2 = temp2.join(join_sdf, on=column + '__explodecol', how=how) if fillna is not None: temp2 = temp2.fillna( dict(zip(join_other_list, [fillna] * len(join_other_list)))) temp2 = temp2.select( other_list_fun(temp2.columns, join_other_list) + [cast2str_udf(c).alias(c) for c in join_other_list]) # temp2.cache() collect_temp2 = collect_orderby(temp2, join_other_list, groupby=temp1.columns, orderby=column + '__explodeindex' if orderby_flag else None) # return collect_temp2,join_other_list # temp2.unpersist() res_other = [ array_eval_udf(c + '__collect').alias(c + '__collect') for c in join_other_list ] sdf = collect_temp2.select( other_list_fun(collect_temp2.columns, [c + '__collect' for c in join_other_list]) + res_other).drop('increasing_id') # sdf = sdf.select([c+'__collect' for c in join_other_list]+sdf.columns).drop('increasing_id') for c in join_other_list: sdf = sdf.withColumnRenamed(c + '__collect', c.split('__explode')[0] + '__collect') if drop: sdf = sdf.drop(column) return sdf
def uniprot_mapping_to_parquet(input_id_mapping, uniprot_fasta_folder, out_path): """ This method read a tab delimited idmapping file from uniprot and a set of FASTA files from a directory to generate a parquet file with the following five columns: - AC: Uniprot Protein Accession - ID: Uniprot Protein Name - Gene: Gene name for the corresponding protein - Organism: Organism for the corresponding protein - EvidenceLevel: Protein evidence level following uniprot guidelines :param input_id_mapping: input id tab-delimited mapping file from uniprot :param uniprot_fasta_folder: folder containing fasta files from uniprot (ext fasta.gz) :return: """ # Read the id mapping file and keep only the Unprot accession and the Protein name sql_context = SparkSession.builder.getOrCreate() print("======= processing:" + input_id_mapping) df = sql_context.read.csv(path=input_id_mapping, sep='\t', header=False) df = df.select(df.columns[:21]).toDF( "AC", "ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "UniRef100", "UniRef90", "UniRef50", "UniParc", "PIR", "NCBI-taxon", "MIM", "UniGene", "PubMed", "EMBL", "EMBL-CDS", "Ensembl", "Ensembl_TRS", "Ensembl_PRO") df_uniprot_id = df.select(col("AC"), col("ID")) df_uniprot_id.show(n=30, truncate=False) df_ref_seq = df.select(col("AC"), col("RefSeq")).withColumnRenamed("RefSeq", "ID") df_ref_seq.show(n=30, truncate=False) df_ensembl_seq = df.select(col("AC"), col("Ensembl_PRO")) df_ensembl_seq = df_ensembl_seq.select( "AC", f.split("Ensembl_PRO", "; ").alias("letters"), f.posexplode(f.split("Ensembl_PRO", "; ")).alias("pos", "val")).select("AC", "val") df_ensembl_seq = df_ensembl_seq.withColumnRenamed("val", "ID") # df_ensembl_seq.show(n=40, truncate=False) # create list of dataframes dfs = [df_uniprot_id, df_ref_seq, df_ensembl_seq] # create merged dataframe df = reduce(DataFrame.unionAll, dfs) # df_ensembl_seq.show(n=3000, truncate=False) df_uniprot = df.select([trim(col(c)).alias(c) for c in df.columns]) df_uniprot.show(n=30, truncate=False) print("======= processing:" + uniprot_fasta_folder) uniprot_fasta = sql_context.read.text(uniprot_fasta_folder + "*.fasta.gz") uniprot_fasta = uniprot_fasta.filter( lower(uniprot_fasta.value).contains(">")) cols = [ "Accession", "ProteinName", "Gene", "Organism", "TaxId", "EvidenceLevel" ] uniprot_fasta = uniprot_fasta.rdd.map(uniprot_header_to_df).toDF().toDF( *cols) # uniprot_fasta.show(n=30) complete_uniprot = df_uniprot.join( uniprot_fasta, df_uniprot.AC == uniprot_fasta.Accession, "left").drop(uniprot_fasta.Accession) # This delete duplicated records by accession complete_uniprot = complete_uniprot.dropDuplicates(['AC', 'ID']) complete_uniprot = complete_uniprot.filter(complete_uniprot.ID.isNotNull()) complete_uniprot.write.parquet(out_path, mode='append', compression='snappy') print_df = sql_context.read.parquet(out_path) print_df.show(n=3000, truncate=False)
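# The Ensembl_PRO fan-out used above, in isolation: a minimal sketch assuming
# an active SparkSession named `spark`; the accession and Ensembl IDs are
# illustrative only.
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()
acc = spark.createDataFrame(
    [("P12345", "ENSP00000001; ENSP00000002")],
    ["AC", "Ensembl_PRO"])
acc.select(
    "AC",
    f.posexplode(f.split("Ensembl_PRO", "; ")).alias("pos", "val")) \
    .select("AC", f.col("val").alias("ID")) \
    .show(truncate=False)
# -> one (AC, ID) row per Ensembl protein accession in the semicolon-separated list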
def main(self, sc: SparkContext, *args): os.makedirs(DATASET_DIR, exist_ok=True) #parans min_itens_per_session = 2 max_itens_per_session = self.max_itens_per_session min_itens_interactions = self.min_itens_interactions # Tupla interactions max_relative_pos = self.max_relative_pos spark = SparkSession(sc) df = spark.read.option("delimiter", ";").csv(BASE_DATASET_FILE, header=True, inferSchema=True) df = df.withColumnRenamed("sessionId", "SessionID")\ .withColumnRenamed("eventdate", "Timestamp")\ .withColumnRenamed("itemId", "ItemID")\ .withColumn("Timestamp", (col("Timestamp").cast("long") + col("timeframe").cast("long")/1000).cast("timestamp"))\ .orderBy(col('Timestamp'), col('SessionID'), col('timeframe')).select("SessionID", "ItemID", "Timestamp", "timeframe") # Drop duplicate item in that same session df = df.dropDuplicates(['SessionID', 'ItemID']) # filter date max_timestamp = df.select(max( col('Timestamp'))).collect()[0]['max(Timestamp)'] init_timestamp = max_timestamp - timedelta(days=self.sample_days) df = df.filter(col('Timestamp') >= init_timestamp).cache() df = df.groupby("SessionID").agg( max("Timestamp").alias("Timestamp"), collect_list("ItemID").alias("ItemIDs"), count("ItemID").alias("total")) # Filter Interactions df = df.filter(df.total >= min_itens_per_session).cache() # Filter position in list df_pos = df.select( col('SessionID').alias('_SessionID'), posexplode(df.ItemIDs)) # Explode A df = df.withColumn("ItemID_A", explode(df.ItemIDs)) df = df.join(df_pos, (df.SessionID == df_pos._SessionID) & (df.ItemID_A == df_pos.col))\ .select('SessionID', 'Timestamp', 'ItemID_A', 'pos', 'ItemIDs')\ .withColumnRenamed('pos', 'pos_A') # Explode B df = df.withColumn("ItemID_B", explode(df.ItemIDs)) df = df.join(df_pos, (df.SessionID == df_pos._SessionID) & (df.ItemID_B == df_pos.col))\ .withColumnRenamed('pos', 'pos_B') df = df.withColumn("relative_pos", abs(df.pos_A - df.pos_B)) # Filter distincts df = df.select('SessionID', 'Timestamp', 'ItemID_A', 'pos_A', 'ItemID_B', 'pos_B', 'relative_pos')\ .distinct()\ .filter(df.ItemID_A != df.ItemID_B).cache() # # Filter duplicates # udf_join = F.udf(lambda s,x,y : "_".join(sorted([str(s), str(x),str(y)])) , StringType()) # df = df.withColumn('key', udf_join('SessionID', 'ItemID_A','ItemID_B')) # df = df.dropDuplicates(["key"]) # Calculate and filter probs ocorrence df_probs = self.get_df_tuple_probs(df) df = df.join(df_probs, (df.ItemID_A == df_probs._ItemID_A) & (df.ItemID_B == df_probs._ItemID_B)) # Add positive interactoes df_positive = self.add_positive_interactions(df) # Filter confidence df = df.filter(col("total_ocr_dupla") >= min_itens_interactions)\ .filter(col("relative_pos") <= max_relative_pos)\ .filter(col("pos_A") <= self.max_itens_per_session) # df = df.select("SessionID", 'Timestamp', 'ItemID_A', 'pos_A', # 'ItemID_B', 'pos_B', 'relative_pos', # 'total_ocr', 'total_ocr_dupla', 'prob', 'sub_a_b')\ # .dropDuplicates(['ItemID_A', 'ItemID_B', 'relative_pos']) # TODO is it right? df = df.select("SessionID", 'Timestamp', 'ItemID_A', 'ItemID_B', 'relative_pos', 'total_ocr', 'total_ocr_dupla')\ .dropDuplicates(['ItemID_A', 'ItemID_B', 'relative_pos']) # TODO is it right? df.select("ItemID_A").dropDuplicates().toPandas().to_csv( self.output()[2].path, index_label="item_idx") df.select("SessionID").dropDuplicates().toPandas().to_csv( self.output()[3].path, index_label="session_idx") df.write.parquet(self.output()[0].path) df_positive.to_csv(self.output()[1].path)
import pyspark.sql.functions as f

# Read yelp_business.csv as a dataframe
yelp_b = spark.read.option("header", "true").option("quote", "\"").option(
    "escape", "\"").csv("/Project_BigData/Data/yelp_business.csv")

# yelp_diff_categories has one row per business and category
# (the semicolon-separated categories column is split and exploded)
yelp_diff_categories = yelp_b.select(
    "business_id",
    f.split("categories", ";").alias("categories"),
    f.posexplode(f.split("categories", ";")).alias("pos", "val"))

# Show all the categories with their counts in descending order
yelp_categories = yelp_diff_categories.groupBy("val").count().orderBy(
    "count", ascending=False)

# Write the output as CSV
yelp_categories.write.csv("/PopularBusinesses")
def main(self, sc: SparkContext, *args): os.makedirs(DATASET_DIR, exist_ok=True) spark = SparkSession(sc) df = spark.read.csv(self.input()[0].path, header=True, inferSchema=True) item_idx_df = spark.read.csv(self.input()[1][0].path, header=True, inferSchema=True) item_idx_dict = item_idx_df.toPandas().set_index( 'item_id')['item_idx'].to_dict() item_idx_most_dict = self.most_popular_array(df) print(item_idx_most_dict) # Expand impressions interactions df = df.withColumn("impressions", to_array_int_udf(df.impressions)).\ withColumn("prices", to_array_float_udf(df.prices)) sort_array_by_dict = udf( lambda x: sorted(x, key=lambda _x: item_idx_most_dict[_x] if _x in item_idx_most_dict else 0, reverse=True), ArrayType(IntegerType())) df = df.withColumn("list_mean_price", array_mean(df.prices)) # Convert item_id to item_idx in impressions to_item_idx_from_dict_udf = udf( lambda x: [item_idx_dict[i] if i in item_idx_dict else 0 for i in x], ArrayType(IntegerType())) df = df.withColumn("impressions", to_item_idx_from_dict_udf(df.impressions)) df = df.withColumn("impressions_popularity", sort_array_by_dict(df.impressions)) # Explode df = df.select( "*", posexplode("impressions").alias("pos_item_idx", "item_idx")) df = df.withColumn("price", df["prices"].getItem(df.pos_item_idx)).\ withColumn("popularity", df["impressions_popularity"].getItem(df.pos_item_idx)).\ withColumn("clicked", when(df.action_type_item_idx == df.item_idx, 1.0).otherwise(0.0)).\ withColumn("view", lit(1.0)).orderBy('timestamp') df = df.withColumn("is_first_in_impression", df.pos_item_idx == lit(0)) df = df.withColumn("first_item_idx", df["impressions"].getItem(lit(0))) df = df.withColumn("popularity_item_idx", df["impressions_popularity"].getItem(lit(0))) df = df.withColumn("diff_price", df.price - df.list_mean_price) # Quartile Discretize columns = ["price", "popularity"] for c in columns: discretizer = QuantileDiscretizer(numBuckets=10, inputCol=c, outputCol=c + "_BIN") df = discretizer.fit(df).transform(df) # add feature 'user_view', 'hist_views' win_user_item = Window.partitionBy('user_idx', 'item_idx').orderBy('timestamp') \ .rangeBetween(Window.unboundedPreceding, -1) win_user = Window.partitionBy('user_idx').orderBy('timestamp')\ .rangeBetween(Window.unboundedPreceding, -1) df = df.withColumn("user_view", F.sum(col("view")).over(win_user)).\ withColumn("hist_views", F.sum(col("view")).over(win_user_item)).\ fillna(0, subset=["user_view", "hist_views"]) df = df.withColumn("user_view", df.user_view + lit(1)).\ withColumn("hist_views", df.hist_views + lit(1)) df = df.withColumn("ps", df.hist_views / df.user_view) print(df.toPandas().head()) df.orderBy('timestamp', "pos_item_idx").toPandas().to_csv(self.output().path, index=False)
def main(): logfile = "{0}/log/sentence_segment.log".format(PROJECT_DIR) utils.setup_logging(logfile, logging.INFO) logging.info(sys.path) args = utils.parse_arguments() batchsize = args.batchsize start_time = time.time() # set_env_vars() spark_session = spark.create_spark_session() es = elastic.check_elasticsearch() es_write_conf = spark.broadcast_es_write_config(spark_session) s3resource = aws.create_s3_resource() keys = aws.get_list_s3_files(s3resource, filetype=TEXT_FOLDER, numrows=batchsize) pbooks = s3_to_rdd(spark_session, keys) # testing_rdd = spark.sparkContext.wholeTextFiles("s3a://jason-b/{0}".format(TEXT_FOLDER), minPartitions=6, use_unicode=False) # spark.log_rdd(pbooks) pipeline = spark_nlp.setup_pipeline() books = spark_nlp.segment_sentences(spark_session, pbooks, pipeline) # Go from one book per row to one sentence per row sentences = books.select( func.monotonically_increasing_id().alias("sentence_id"), func.col("fileName"), func.posexplode("sentence.result").alias("position", "sentenceText"), func.size("sentence.result").alias("numSentencesInBook"), ) # logging.info("Num Sentences: {0}".format(sentences.count())) sentences = spark_nlp.tokenize_sentences(sentences) count_syllables_udf = func.udf( lambda s: _udf_sentence_count_syllables_sentence(s), ArrayType(IntegerType())) count_sentence_multisyllables_udf = func.udf( lambda s: sum(_udf_sentence_count_syllables_sentence(s)), IntegerType()) count_array_multisyllables_udf = func.udf( lambda a: sum(_udf_array_count_syllables_sentence(a)), IntegerType()) sentences = sentences.select( "sentence_id", "fileName", "position", "sentenceText", "numSentencesInBook", # count_sentence_multisyllables_udf('sentenceText') \ # .alias("multiSyllableCount") count_array_multisyllables_udf("words") \ .alias("multiSyllableCount"), func.size("words").alias("numWordsInSentence"), ) sentences.printSchema() # pipeline = spark_nlp.setup_sentiment_pipeline() # output = spark_nlp.sentiment_analysis(sentence_data, pipeline) # Format to load ElasticSearch sentences = sentences.select( "sentence_id", func.to_json( func.struct("sentence_id", "fileName", "position", "sentenceText", "numSentencesInBook", "multiSyllableCount", "numWordsInSentence")).alias("value")) # sentence = output.select(["sentence_id", "sentence"]).toJSON() sentences = sentences.rdd.map(lambda x: elastic.format_data(x)) # Write to ES write_rdd_to_es(sentences, es_write_conf) """ # _read_es() # sentences = sentence.rdd.map(lambda s: s.sentence[0].result) # sentences = sentence.rdd.flatMap(lambda s: s.sentence) # results = sentence.rdd.map(lambda s: s.result).zipWithUniqueId() """ spark_session.stop() end_time = time.time() logging.info("RUNTIME: {0}".format(end_time - start_time))
    [[0, 1, 1, 1, 1], [1, 5, 5, 5, 1], [1, 2, 3, 2, 1], [0, 1, 2, 1, 0], [0, 1, 2, 1, 0]]],
    'x_values': [[0, 40, 75, 100, 150], [0, 40, 75, 100, 150]],
    'y_values': [[10, 40, 70, 80, 110], [10, 40, 70, 80, 110]]
})
dfSpark = spark.createDataFrame(df_arr)
display(dfSpark)

# COMMAND ----------

# DBTITLE 1,Transform to table step 1
from pyspark.sql.functions import posexplode

# Explode the matrix rows with posexplode so that we know which index each
# row of z-values comes from. We need the index to map pairs of x,y to the
# correct value/position in the matrix containing the z-values.
dfExploded1 = dfSpark.select("id", "matrixdata", "x_values",
                             "y_values", posexplode("matrixdata"))\
    .withColumnRenamed("pos", "y_index").withColumnRenamed("col", "values")
display(dfExploded1)

# COMMAND ----------

dfExploded2 = dfExploded1.select("id", "matrixdata", "x_values",
                                 "y_values", "y_index", "values",
                                 posexplode("values"))\
    .withColumnRenamed("pos", "x_index").withColumnRenamed("col", "z_value")
display(dfExploded2)

# COMMAND ----------

# Now extract the x- and y-values using our indexes for each row; then we can
# use the data in an external tool since we have separate columns/dimensions.
.select("vehicle_array", sort_array("vehicle_array", asc=True)) \ .show(5, False) # Use the `array_contains` function to search the array: from pyspark.sql.functions import array_contains drivers_array \ .select("vehicle_array", array_contains("vehicle_array", "Subaru")) \ .show(5, False) # Use the `explode` and `posexplode` functions to explode the array: from pyspark.sql.functions import explode, posexplode drivers_array \ .select("vehicle_array", explode("vehicle_array")) \ .show(5, False) drivers_array \ .select("vehicle_array", posexplode("vehicle_array")) \ .show(5, False) # Note that you can pass multiple names to the `alias` method: drivers_array \ .select("vehicle_array", posexplode("vehicle_array").alias("position", "column")) \ .show(5, False) # ## Maps # Use the # [create_map](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.create_map) # function to create a map: from pyspark.sql.functions import lit, create_map drivers_map = drivers \ .withColumn("vehicle_map", create_map(lit("make"), "vehicle_make", lit("model"), "vehicle_model")) \
from pyspark.sql.functions import (
    concat, concat_ws, collect_list, collect_set,
    explode, explode_outer, flatten, greatest, least,
    posexplode, posexplode_outer, struct
)

concat(*col)  # Concatenates multiple input columns together into a single column. The function works with strings, binary and compatible array columns.
concat_ws(sep=";", *col)  # Concatenates multiple input string columns together into a single string column, using the given separator.
collect_list  ## df2.agg(collect_list('age')).collect() Aggregate function: returns a list of objects with duplicates.
collect_set  ### Aggregate function: returns a set of objects with duplicate elements eliminated.
explode  ## array --> rows eDF.select(explode(eDF.intlist).alias("anInt")).collect()
explode_outer  ### array --> rows Unlike explode, if the array/map is null or empty then a null row is produced.
flatten  ## flatten an array of arrays Collection function: creates a single array from an array of arrays.
greatest  # Returns the greatest value of the list of column names df.select(greatest(df.a, df.b, df.c).alias("greatest")).collect()
least(col1, col2, col3)  # Returns the least value of the list of column names, skipping null values.
posexplode(col)  # Returns a new row for each element with position in the given array or map. eDF.select(posexplode(eDF.intlist)).collect()
posexplode_outer  ### like posexplode, but produces a (null, null) row if the array/map is null or empty.
struct  ## new struct column, df.select(struct('age', 'name').alias("struct")).collect()

#### Rows Agg operation #######################################################
from pyspark.sql.functions import (
    grouping, grouping_id, first, last
)

grouping  # df.cube("name").agg(grouping("name"), sum("age")).orderBy("name").show()
grouping_id  # df.cube("name").agg(grouping_id(), sum("age")).orderBy("name").show() returns the level of grouping
first  ### first row of a group
last  ### last row of a group
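# A small runnable illustration of the explode family above, assuming an
# active SparkSession named `spark`; the DataFrame and values are made up.
eDF = spark.createDataFrame(
    [(1, [1, 2, 3]), (2, [])],
    "id int, intlist array<int>")

eDF.select("id", explode("intlist").alias("anInt")).show()   # 3 rows, all from id 1
eDF.select("id", posexplode("intlist")).show()                # adds the element position as `pos`
eDF.select("id", posexplode_outer("intlist")).show()          # keeps id 2 as a (null, null) row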
if len(sys.argv) != 3:
    print("Usage: parse_arxiv <input> <output_dir>", file=sys.stderr)
    sys.exit(-1)

# Initialize the Spark session.
spark = SparkSession \
    .builder \
    .appName("ParseArxivDataset") \
    .getOrCreate()

# Load the input files; each should be a JSON record.
join_res_path = "{}/*json".format(sys.argv[1])
join_res = spark.read.json(join_res_path)

output = join_res.withColumn("author", functions.explode(functions.col("authors")))\
    .withColumn("category", functions.explode(functions.col("categories")))

output = output.select(
    "author.id", "year", "n_citation", "title",
    functions.posexplode(functions.split("category", " ")).alias("pos", "cate"))

output.printSchema()
output.write.json(sys.argv[2])
spark.stop()
vessel_types = [30, 1001, 1002, 21, 22, 21, 32, 52, 1023, 1025, 36, 37, 1019, 1021, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 1012, 1013, 1014, 1015, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1003, 1004, 1016, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 1017, 1024] pings = pings.where(pings.VesselType.cast("Double").isin(vessel_types)) # Invoke Function for geoHashing Ports: # see benchmarking results for cell_size_degrees cell_size_degrees = 0.4 ports = ports.withColumn("GRID_CELLS", polygon_to_gridCell_hashSet_udf(ports.POLYGON10KM, psql.lit(cell_size_degrees))) ports = ports.withColumn("GRID_CELLS", psql.expr("substring(GRID_CELLS, 2, (length(GRID_CELLS)-1))")) # Explode geoHash cells from list types to (one row per cell) to get complete set of port geoHash cells ports_geoHash = ports.select( ports.PORT_NAME, \ ports.POLYGON10KM, \ ports.LON_LAT, \ psql.split("GRID_CELLS", ", ").alias("GRID_CELLS"), \ psql.posexplode(psql.split("GRID_CELLS", ", ")).alias("pos", "val") \ ) # Take exploded points, separate into integer lat and lon for joining ports_geoHash = ports_geoHash.withColumn('GRID_POINTS', psql.regexp_extract('val', '(-)*[0-9]+ (-)*[0-9]+', 0)).withColumn('LON_CELL', psql.split('GRID_POINTS', ' ').getItem(0).cast("INT")).withColumn('LAT_CELL', psql.split('GRID_POINTS', " ").getItem(1).cast("INT")) # GeoHash Pings: pings = pings.withColumn("GRID_X", psql.floor(pings.LON.cast('Double')/cell_size_degrees)) pings = pings.withColumn("GRID_Y", psql.floor(pings.LAT.cast('Double')/cell_size_degrees)) # Testing: Find Bay Area Hits # ports_geoHash.filter((psql.col("Y") > 36) & (psql.col("Y") < 39) & (psql.col("X") < -120) & (psql.col("X") > -123)).show() # pings.filter((psql.col("LAT") > 36) & (psql.col("LAT") < 39) & (psql.col("LON") < -120) & (psql.col("LON") > -123)).select("LON", "LAT", "GRID_X", "GRID_Y").take(20) # pingsD = pings.filter(pings.Status == 'moored').limit(250) # for testing # jpings.filter(jpings.PORT_NAME.isNotNull()).count() # for testing # pings = pings.join(psql.broadcast(ports_geoHash.select("PORT_NAME", "POLYGON10KM", "LON_CELL", "LAT_CELL")), on = (pings.GRID_X == ports_geoHash.LON_CELL) & (pings.GRID_Y == ports_geoHash.LAT_CELL), how = 'inner') # broadcast doesn't seem to improve performance
def insert_time_dim(start_date_id, end_date_id):
    time_begin = datetime.strptime(str(start_date_id), "%Y%m%d").date()
    time_end = datetime.strptime(str(end_date_id), "%Y%m%d").date()

    print('time_begin')
    print(time_begin)
    print('time_end')
    print(time_end)

    # Build a dataframe from time_begin and time_end
    data = [(time_begin, time_end)]
    df = spark.createDataFrame(data, ["minDate", "maxDate"])
    # Convert the data types and field names
    df = df.select(
        df.minDate.cast(DateType()).alias("minDate"),
        df.maxDate.cast(DateType()).alias("maxDate"))

    # Expand to every day between minDate and maxDate
    df = df.withColumn("daysDiff", f.datediff("maxDate", "minDate")) \
        .withColumn("repeat", f.expr("split(repeat(',', daysDiff), ',')")) \
        .select("*", f.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", f.expr("to_date(date_add(minDate, date))")) \
        .select('date')

    # Derive the day/month/year attributes from each date
    df = df.withColumn('id', f.date_format(df.date, "yyyyMMdd")) \
        .withColumn('ngay_trong_thang', f.dayofmonth(df.date)) \
        .withColumn('ngay_trong_tuan', f.from_unixtime(f.unix_timestamp(df.date, "yyyy-MM-dd"), "EEEEE")) \
        .withColumn('tuan_trong_nam', f.weekofyear(df.date)) \
        .withColumn('thang', f.month(df.date)) \
        .withColumn('quy', f.quarter(df.date)) \
        .withColumn('nam', f.year(df.date))
    df = df.withColumn('tuan_trong_thang', (df.ngay_trong_thang - 1) / 7 + 1)

    data_time = DynamicFrame.fromDF(df, glueContext, 'data_time')

    # Convert data types
    data_time = data_time.resolveChoice(specs=[('tuan_trong_thang', 'cast:int')])

    # Choose the fields and data types to load into the database
    applymapping1 = ApplyMapping.apply(
        frame=data_time,
        mappings=[("id", "string", "id", "bigint"),
                  ("ngay_trong_thang", 'int', 'ngay_trong_thang', 'int'),
                  ("ngay_trong_tuan", "string", "ngay_trong_tuan", "string"),
                  ("tuan_trong_thang", "int", "tuan_trong_thang", "int"),
                  ("tuan_trong_nam", "int", "tuan_trong_nam", "int"),
                  ("thang", "int", "thang", "int"),
                  ("quy", "int", "quy", "int"),
                  ("nam", "int", "nam", "int"),
                  ("date", "date", "ngay", "timestamp")])

    resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    dropnullfields3 = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields3")

    # Write the data to the database
    preactions = 'delete student.time_dim where id >= ' + str(start_date_id)
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields3,
        catalog_connection="glue_redshift",
        connection_options={
            "preactions": preactions,
            "dbtable": "student.time_dim",
            "database": "student_native_report"
        },
        redshift_tmp_dir=
        "s3n://dts-odin/temp/tu-student_native_report/student/time_dim",
        transformation_ctx="datasink4")
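# The repeat/split/posexplode date-expansion trick above, in isolation: a
# minimal sketch assuming an active SparkSession named `spark` and
# `from pyspark.sql import functions as f`; it yields one row per day
# between minDate and maxDate. The dates are illustrative.
span = spark.createDataFrame([("2023-01-01", "2023-01-03")], ["minDate", "maxDate"]) \
    .select(f.to_date("minDate").alias("minDate"), f.to_date("maxDate").alias("maxDate"))

days = span \
    .withColumn("daysDiff", f.datediff("maxDate", "minDate")) \
    .withColumn("repeat", f.expr("split(repeat(',', daysDiff), ',')")) \
    .select("minDate", f.posexplode("repeat").alias("offset", "val")) \
    .select(f.expr("date_add(minDate, offset)").alias("date"))

days.show()  # 2023-01-01, 2023-01-02, 2023-01-03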
def test_posexplode_litarray(data_gen):
    array_lit = gen_scalar(
        ArrayGen(data_gen, min_length=3, max_length=3, nullable=False))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: four_op_df(spark, data_gen).select(
            f.col('a'), f.col('b'), f.col('c'), f.posexplode(array_lit)))
def get_urls_exploded_by_split_urls(self, split_urls: DataFrame) -> DataFrame:
    return split_urls \
        .select("id", "url", posexplode(split_urls.split_url)) \
        .withColumnRenamed("col", "word")
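# A minimal sketch of the input/output shape for the method above, assuming
# an active SparkSession named `spark`; the URL and tokens are illustrative.
from pyspark.sql.functions import posexplode

split_urls = spark.createDataFrame(
    [(1, "http://example.com/red-shoes", ["red", "shoes"])],
    "id int, url string, split_url array<string>")
exploded = split_urls \
    .select("id", "url", posexplode(split_urls.split_url)) \
    .withColumnRenamed("col", "word")
exploded.show(truncate=False)
# columns: id, url, pos, word -> one row per token, keeping its position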