def variants_from_tped (tped):
    """
    Extract per-variant information from a tped file held in a data frame.

    Every input row is split on tabs/spaces.  The first four fields become
    CHR, ID, MAP and POS; the remaining allele fields are summarised as
    n_samples, alleles_present (the distinct non-'0' alleles) and
    missingcnt (the number of allele pairs containing at least one '0').
    The array summaries are computed with pandas UDFs that apply a plain
    Python function to each row's array.
    """
    missing_marker = frozenset(['0'])

    def alleles_in_row(fields):
        # Fields 0-3 are the variant columns; the rest are allele calls
        return(list(frozenset(fields[4:]) - missing_marker))

    def missing_pairs_in_row(fields):
        # Walk the allele calls two at a time (one pair per sample)
        n_missing = 0
        pair_start = 4
        while pair_start < len(fields):
            if (fields[pair_start] == '0') | (fields[pair_start + 1] == '0'):
                n_missing += 1
            pair_start += 2
        return(n_missing)

    alleles_udf = f.pandas_udf(lambda col_values: col_values.apply(alleles_in_row),
                               ArrayType(StringType()))
    missing_udf = f.pandas_udf(lambda col_values: col_values.apply(missing_pairs_in_row),
                               IntegerType())

    # Split each tped line into its whitespace-separated fields
    tokens = tped.select(
        "filename",
        f.split(tped.data, "[\t ]+").alias("split_data"),
        tped.lineid.alias("VAR_IDX"))
    fields = tokens.split_data

    # Leading four columns with casts, plus the UDF-derived summaries
    output_columns = [
        "filename",
        "VAR_IDX",
        f.element_at(fields, 1).alias("CHR"),
        f.element_at(fields, 2).alias("ID"),
        f.element_at(fields, 3).cast(FloatType()).alias("MAP"),
        f.element_at(fields, 4).cast(IntegerType()).alias("POS"),
        ((f.size(fields) - 4) / 2).alias("n_samples"),
        alleles_udf("split_data").alias("alleles_present"),
        missing_udf("split_data").cast(FloatType()).alias("missingcnt"),
    ]
    return(tokens.select(*output_columns))
def main(input_dir,output_dir):
    """
    Render one word-cloud PNG per year (2008-2019) from headline JSON.

    :param input_dir: location of the JSON input, read with an explicit schema
    :param output_dir: directory receiving ``<year>_words.png`` files

    Years that yield no joined headline text are skipped with a message
    instead of aborting the whole run.
    """
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('created_utc_iso', types.DateType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType()))
    ])

    headlines_df = spark.read.json(input_dir,encoding='utf-8',schema=df_schema).repartition(80)
    split_sentiment_df = headlines_df.withColumn(
        'polarity', functions.element_at(headlines_df['polarity_subjectivity'],1)
    ).withColumn(
        'subjectivity', functions.element_at(headlines_df['polarity_subjectivity'],2)
    ).cache()

    try:
        for year_int in range(2008,2020):
            print('Plotting for '+str(year_int))
            headlines_year = split_sentiment_df.where(
                functions.year(split_sentiment_df['created_utc_iso']) == year_int
            ).withColumn('year',functions.year(split_sentiment_df['created_utc_iso']))
            headlines_grouped = headlines_year.groupBy(headlines_year['year']).agg(
                functions.collect_set(headlines_year['title_clean']).alias('titles_group')
            )
            headlines_joined = headlines_grouped.select(
                functions.array_join(headlines_grouped['titles_group'],' ').alias('joined')
            )

            # At most one row remains (the concatenated headlines); first()
            # returns None when the year has no rows at all.
            joined_row = headlines_joined.first()
            string_to_plot = joined_row['joined'] if joined_row is not None else None
            if not string_to_plot:
                print('No headlines for '+str(year_int)+', skipping')
                continue

            wordcloud = WordCloud(background_color='white',
                                  stopwords=stopwords,
                                  width=1000,
                                  height=500).generate(string_to_plot)
            wordcloud.to_file(output_dir + '/'+str(year_int)+'_words.png')
    finally:
        # Release the cached frame once all years have been processed
        split_sentiment_df.unpersist()
def _get_base_cols(row: StructExpression) -> List[Column]:
    """
    Build the base column expressions: contigName, start, end, names,
    referenceAllele and alternateAlleles.

    ``end`` defaults to ``start`` plus the length of the first allele and is
    overridden by ``info.END`` when ``row`` has a struct ``info`` field with
    an int ``END``.  ``names`` is assembled from whichever of ``varid`` and
    ``rsid`` are present in ``row`` with string type.
    """
    assert check_argument_types()

    first_allele = fx.element_at("alleles", 1)

    contig = fx.col("`locus.contig`").alias("contigName")
    start = (fx.col("`locus.position`") - 1).cast("long").alias("start")

    end = start + fx.length(first_allele)
    info_is_struct = 'info' in row and isinstance(row.info.dtype, tstruct)
    if info_is_struct and 'END' in row.info and row.info.END.dtype == tint:
        end = fx.coalesce(fx.col("`info.END`"), end)
    end = end.cast("long").alias("end")

    name_fields = [
        field for field in ("varid", "rsid")
        if field in row and getattr(row, field).dtype == tstr
    ]
    joined_fields = ','.join(name_fields)
    names = fx.expr(
        f"nullif(filter(array({joined_fields}), n -> isnotnull(n)), array())"
    ).alias("names")

    reference_allele = first_allele.alias("referenceAllele")
    alternate_alleles = fx.expr(
        "slice(alleles, 2, size(alleles) - 1)").alias("alternateAlleles")

    base_cols = [contig, start, end, names, reference_allele, alternate_alleles]
    assert check_return_type(base_cols)
    return base_cols
def test_array_element_at(data_gen):
    """Compare CPU and GPU results of element_at at index 1 and index -1."""
    test_conf = {
        'spark.sql.ansi.enabled': False,
        'spark.sql.legacy.allowNegativeScaleOfDecimal': True,
    }

    def select_first_and_last(spark):
        frame = unary_op_df(spark, data_gen)
        return frame.select(
            element_at(col('a'), 1),
            element_at(col('a'), -1))

    assert_gpu_and_cpu_are_equal_collect(select_first_and_last, conf=test_conf)
def driver_accumulator_updates(df):
    """
    Flatten SparkListenerDriverAccumUpdates events into one row per update.

    Returns executionId plus the first and second entries of every exploded
    ``accumUpdates`` element, named ``accumulator`` and ``value``.
    """
    event_name = 'org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates'

    driver_events = df.where(df.Event == event_name)
    exploded = driver_events.select(
        "executionId",
        F.explode("accumUpdates").alias("updates"))

    update = F.col("updates")
    return exploded.select(
        "executionId",
        F.element_at(update, 1).alias("accumulator"),
        F.element_at(update, 2).alias("value"))
def process_adverse_events(adverse_events: str) -> DataFrame:
    """
    Loads and processes the adverse events input TSV.

    Ex. input record:
        biologicalSystem | gastrointestinal
        effect           | activation_general
        efoId            | EFO_0009836
        ensemblId        | ENSG00000133019
        pmid             | 23197038
        ref              | Bowes et al. (2012)
        symptom          | bronchoconstriction
        target           | CHRM3
        uberonCode       | UBERON_0005409
        url              | null

    Ex. output record:
        id         | ENSG00000133019
        event      | bronchoconstriction
        datasource | Bowes et al. (2012)
        eventId    | EFO_0009836
        literature | 23197038
        url        | null
        biosample  | {gastrointestinal, UBERON_0005409, null, null, null}
        effects    | [{activation, general}]
    """
    raw_df = spark.read.csv(adverse_events, sep='\t', header=True)

    biosample = F.struct(
        F.col('biologicalSystem').alias('tissueLabel'),
        F.col('uberonCode').alias('tissueId'),
        F.lit(None).alias('cellLabel'),
        F.lit(None).alias('cellFormat'),
        F.lit(None).alias('cellId'),
    ).alias('biosample')

    renamed_df = raw_df.select(
        F.col('ensemblId').alias('id'),
        F.col('symptom').alias('event'),
        F.col('efoId').alias('eventId'),
        F.col('ref').alias('datasource'),
        F.col('pmid').alias('literature'),
        'url',
        biosample,
        F.split(F.col('effect'), '_').alias('effects'),
    )

    # The effect string is "<direction>_<dosing>"; turn the split pieces
    # into a struct with named fields.
    effect_parts = F.col('effects')
    ae_df = renamed_df.withColumn(
        'effects',
        F.struct(
            F.element_at(effect_parts, 1).alias('direction'),
            F.element_at(effect_parts, 2).alias('dosing')),
    )

    # Multiple dosing effects need to be grouped in the same record.
    record_key = ["id", "event", "datasource"]
    effects_df = ae_df.groupBy('id', 'event', 'datasource').agg(
        F.collect_set(F.col("effects")).alias("effects"))
    ae_df = ae_df.drop("effects").join(effects_df, on=record_key, how="left")

    return ae_df
def select_type2(df, *type2):
    """Select fields for log format; 155138

    Rows with logtype 'login', 'purchase' or 'cart' take productCode and
    productName straight from the ``custom`` JSON.  Rows with logtype 'view'
    take them from 'og:url' and 'og:title', keeping only the last
    '/'-separated segment of the URL as productCode.  Both sets are unioned.

    :param df: Input DataFrame
    :param type2: A list of shopping_sites_id
    :return: Output DataFrame
    """
    in_selected_sites = df.info.siteseq.isin(*type2)

    # 'view' events: product code is embedded in the og:url value
    view_rows = df.filter(df.logtype.isin('view') & in_selected_sites)
    stage_df = view_rows.select(
        'maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
        json_tuple(df.custom, 'og:url', 'og:title').alias(
            'productCode', 'productName'),
    ).withColumnRenamed('info.siteseq', 'siteseq')
    stage_df = stage_df.withColumn('productCode',
                                   split(stage_df['productCode'], '/'))
    view_part = stage_df.select(
        'maid', 'siteseq', 'userid', 'timestamp', 'logtype',
        element_at(stage_df.productCode, -1).alias('productCode'),
        'productName')

    # login / purchase / cart events: fields are present under their own keys
    action_rows = df.filter(
        df.logtype.isin('login', 'purchase', 'cart') & in_selected_sites)
    action_part = action_rows.select(
        'maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
        json_tuple(df.custom, 'productCode', 'productName').alias(
            'productCode', 'productName'),
    ).withColumnRenamed('info.siteseq', 'siteseq')

    return (action_part.unionAll((view_part)))
def test_array_element_at_ansi_fail(data_gen):
    """With ANSI mode on, element_at at index 100 must fail on CPU and GPU."""
    test_conf = {
        'spark.sql.ansi.enabled': True,
        'spark.sql.legacy.allowNegativeScaleOfDecimal': True,
    }

    def collect_out_of_range(spark):
        frame = unary_op_df(spark, data_gen)
        return frame.select(element_at(col('a'), 100)).collect()

    assert_gpu_and_cpu_error(
        collect_out_of_range,
        conf=test_conf,
        error_message='java.lang.ArrayIndexOutOfBoundsException')
def gts_from_impute (infile):
    """
    Reshape the genotype-probability fields of an IMPUTE-style data frame.

    Lines starting with '#' are dropped.  Every remaining line is split on
    tabs/spaces and the fields after the fifth are grouped in threes, giving
    one output row per (filename, VAR_IDX, SAMPLE_IDX) with float columns
    P11, P12 and P22.
    """
    # Get the main data and put a unique index on each variant
    body_rows = infile.filter(infile.data[0:1] != "#")
    tokens = body_rows.select(
        "filename",
        f.split(body_rows.data, "[\t ]+").alias("split_data"),
        body_rows.lineid.alias("VAR_IDX"))

    # One row per field; keep only the fields past the five leading columns
    exploded = tokens.select("filename", "VAR_IDX", f.posexplode(tokens.split_data))
    long_probs = exploded.toDF(
        "filename", "VAR_IDX", "COLUMN_IDX", "GTPROB").filter("COLUMN_IDX > 4")

    # Now, get subject ID and which GT
    sample_idx = f.floor((long_probs.COLUMN_IDX - 5) / 3).alias("SAMPLE_IDX")
    gt_idx = ((long_probs.COLUMN_IDX - 5) % 3).cast(StringType()).alias("GT_IDX")
    indexed = long_probs.select(
        "filename", "VAR_IDX", "GTPROB", "COLUMN_IDX", sample_idx, gt_idx)

    # Pivot the three probabilities of each sample into separate columns
    pivoted = indexed.groupBy("filename", "VAR_IDX", "SAMPLE_IDX") \
        .pivot("GT_IDX", ["0", "1", "2"]) \
        .agg(f.collect_list("GTPROB"))
    renamed = rkutil.withColumnsRenamed(pivoted, ["0", "1", "2"], ["c0", "c1", "c2"])

    result = renamed.select(
        "filename", "VAR_IDX", "SAMPLE_IDX",
        f.element_at(renamed.c0, 1).cast(FloatType()).alias("P11"),
        f.element_at(renamed.c1, 1).cast(FloatType()).alias("P12"),
        f.element_at(renamed.c2, 1).cast(FloatType()).alias("P22"))
    return(result)
def test_array_element_at_zero_index_fail(index, ansi_enabled):
    """
    element_at must raise the "indices start at 1" error on CPU and GPU.

    An int ``index`` is passed as a literal; anything else is used as the
    second column of a two-column frame and referenced via selectExpr.
    """
    message = "SQL array indices start at 1"

    def literal_index_query(spark):
        frame = unary_op_df(spark, ArrayGen(int_gen))
        return frame.select(element_at(col('a'), index)).collect()

    def column_index_query(spark):
        frame = two_col_df(spark, ArrayGen(int_gen), index)
        return frame.selectExpr('element_at(a, b)').collect()

    test_func = literal_index_query if isinstance(index, int) else column_index_query
    assert_gpu_and_cpu_error(
        test_func,
        conf={'spark.sql.ansi.enabled':ansi_enabled},
        error_message=message)
def top10_trends(data):
    """
    Return the ten videos with the most distinct trending dates.

    :param data: DataFrame with video_id, title, description, trending_date,
        views, likes and dislikes columns
    :return: ``{"videos": [...]}`` where every entry holds id, title,
        description, latest_views and the collected trending_days
    """
    per_video = data.groupBy(["video_id", "title", "description"]).agg(
        collect_list(array('trending_date', "views", "likes", "dislikes"))
        .alias("trending_days"),
        # Explicit alias: the sort below must not depend on the column name
        # Spark generates for an un-aliased aggregate.
        countDistinct("trending_date").alias("trending_day_count"))

    # Sort and truncate inside Spark so only ten rows reach the driver.
    top_videos = per_video.orderBy("trending_day_count", ascending=False).limit(10)

    # Index 1 of each [trending_date, views, likes, dislikes] entry is views.
    # NOTE(review): collect_list gives no ordering guarantee, so the last
    # entry is not necessarily the most recent day - confirm upstream order.
    top_videos = top_videos.withColumn(
        "latest_views", element_at("trending_days", -1)[1])

    result = {"videos": []}
    for row in top_videos.collect():
        result["videos"].append({
            "id": row["video_id"],
            "title": row["title"],
            "description": row["description"],
            "latest_views": row["latest_views"],
            "trending_days": row["trending_days"]
        })
    return result
def joinDataSet():
    """
    Load the XML and CSV inputs, register them as temp views, run the join
    query and write every intermediate and final result as tab-separated CSV.
    """
    spark = SparkSession.builder.appName('csv_parse').getOrCreate()

    def write_tsv(frame, target_path):
        # Single-partition, tab-separated, header-less output
        frame.repartition(1).write.option(
            "sep", "\t").format('csv').mode("overwrite").save(target_path,
                                                              header='false')

    #Load xml
    xml_reader = (
        spark.read.format('com.databricks.spark.xml')
        .option("rootTag", "feed")
        .option("rowTag", "doc")
    )
    xml_df = (
        xml_reader.load(xml_s3_path)
        .withColumn("title", f.ltrim(f.split(f.col("title"), ":").getItem(1)))
        .withColumn("shortUrl", f.split(f.col("url"), "/"))
    )

    # Keep only the last path segment of the URL as shortUrl
    selectedData = xml_df.select(
        "title",
        "url",
        f.element_at(f.col('shortUrl'), -1).alias('shortUrl'),
        "abstract")
    write_tsv(selectedData, csv_output_path_2)
    selectedData.createOrReplaceTempView("wiki_pages")

    #Load csv
    company_fields = [
        StructField('name', StringType(), nullable=False),
        StructField('id', IntegerType(), nullable=False),
    ]
    json_schema = ArrayType(StructType(company_fields))

    csv_reader = (
        spark.read.option("header", True)
        .option("quote", "\"")
        .option("escape", "\"")
        .option("multiLine", True)
    )
    df = (
        csv_reader.csv(csv_s3_path)
        .withColumn("sanitizedTitle", f.regexp_replace(f.col("title"), "\\s+", "_"))
        .withColumn("year", f.split(f.col("release_date"), "-").getItem(0))
        .withColumn("companiesList",
                    f.from_json(f.col("production_companies"), json_schema))
        .withColumn("companiesList", f.concat_ws("-", f.col("companiesList.name")))
    )

    csvSelectedData = df.select("title", "sanitizedTitle")
    write_tsv(csvSelectedData, csv_output_path_3)
    df.createOrReplaceTempView("movies_metadata")

    # Join datasets
    joined = spark.sql(join_sql_query)

    # Write output to s3
    write_tsv(joined, csv_output_path)
def test_array_element_at_ansi_fail_invalid_index(index):
    """
    With ANSI mode on, an invalid element_at index must raise the
    version-appropriate out-of-bounds error on both CPU and GPU.
    """
    if is_before_spark_330():
        message = "ArrayIndexOutOfBoundsException"
    else:
        message = "SparkArrayIndexOutOfBoundsException"

    def literal_index_query(spark):
        frame = unary_op_df(spark, ArrayGen(int_gen))
        return frame.select(element_at(col('a'), index)).collect()

    def column_index_query(spark):
        frame = two_col_df(spark, ArrayGen(int_gen), index)
        return frame.selectExpr('element_at(a, b)').collect()

    test_func = literal_index_query if isinstance(index, int) else column_index_query

    # For 3.3.0+ strictIndexOperator should not affect element_at
    test_conf = copy_and_update(
        ansi_enabled_conf, {'spark.sql.ansi.strictIndexOperator': 'false'})
    assert_gpu_and_cpu_error(test_func, conf=test_conf, error_message=message)
def simple_additive_gt(data):
    """
    Add an additive genotype column (GT_ADD) counting ALT alleles.

    RAWGT is split on whitespace into two alleles.  Rows where either allele
    is neither REF nor ALT are shown (distinct VAR_IDX/RAWGT/REF/ALT) and
    then dropped, as are rows whose RAWGT equals '0 0'.  GT_ADD is the number
    of alleles equal to ALT.  REF, ALT and all helper columns are removed
    from the result.

    :param data: DataFrame with at least VAR_IDX, RAWGT, REF and ALT columns
    :return: DataFrame with GT_ADD added
    """
    # Raw string: "\s" is not a valid Python escape sequence.
    alleles = split(data.RAWGT, r"\s+")
    df = data.select("*",
                     element_at(alleles, 1).alias("a1"),
                     element_at(alleles, 2).alias("a2"))
    df = df.select("*",
                   (df.a1 == df.REF).alias("a1R"),
                   (df.a2 == df.REF).alias("a2R"),
                   (df.a1 == df.ALT).alias("a1A"),
                   (df.a2 == df.ALT).alias("a2A"))

    # Now, filter out those for which a1 isn't REF or ALT or a2 isn't REF or ALT.
    # From here on a1/a2 are booleans: "this allele is a recognised one".
    df = df.drop("a1", "a2").withColumn("a1", df.a1A | df.a1R).withColumn(
        "a2", df.a2A | df.a2R)

    # Report the genotypes that are about to be discarded
    df.filter((df.a1 == False) | (df.a2 == False)).select(
        "VAR_IDX", "RAWGT", "REF", "ALT").distinct().show()

    # And make the additive
    df = df.filter(df.a1 == True).filter(
        df.a2 == True).filter("RAWGT != '0 0'").withColumn(
            "GT_ADD",
            df.a1A.cast(IntegerType()) + df.a2A.cast(IntegerType())).drop(
                "a1A", "a2A", "a1R", "a2R", "a1", "a2", "REF", "ALT")
    return (df)
def assoc_fn(df: DataFrame, group_by_cols):
    """
    Aggregate evidence scores per group.

    An ``evs_score`` column (evidence_score / 10, capped at 1.0) is added and
    passed through ``harmonic_fn``; the frame is then grouped by
    ``group_by_cols`` to produce f, mean, std, max, min, N, the harmonic
    column, and the median / q1 / q3 approximate percentiles.
    """
    harmonic_step = partial(harmonic_fn,
                            partition_cols=group_by_cols,
                            over_col="evs_score",
                            output_col=harmonic_col)
    grouping = [col(name) for name in group_by_cols]

    capped_score = array_min(array(col("evidence_score") / 10.0, lit(1.0)))
    scored_df = df.withColumn("evs_score", capped_score).transform(harmonic_step)

    aggregations = [
        countDistinct(col("pmid")).alias("f"),
        mean(col("evidence_score")).alias("mean"),
        stddev(col("evidence_score")).alias("std"),
        max(col("evidence_score")).alias("max"),
        min(col("evidence_score")).alias("min"),
        expr("approx_percentile(evidence_score, array(0.25, 0.5, 0.75))").alias("q"),
        count(col("pmid")).alias("N"),
        first(col(harmonic_col)).alias(harmonic_col),
    ]
    summary_df = scored_df.groupBy(*grouping).agg(*aggregations)

    # q holds [q1, median, q3]; unpack it into named columns and drop it
    quartiles = col("q")
    assoc_df = (
        summary_df
        .withColumn("median", element_at(quartiles, 2))
        .withColumn("q1", element_at(quartiles, 1))
        .withColumn("q3", element_at(quartiles, 3))
        .drop("q")
    )
    return assoc_df
def variants_from_vcf (vcf):
    """
    Extract the first nine variant columns of a VCF held in a data frame.

    Lines starting with '#' are dropped.  Each remaining line yields
    CHR, POS, ID, REF, ALT, QUAL and FILTER, plus INFO split on ';' and
    FORMAT split on ':'.  ``lineid`` is carried through as VAR_IDX.
    """
    # Get the main data and put a unique index on each variant
    body_rows = vcf.filter(~vcf.data.startswith('#'))

    # NOTE(review): substring_index is documented as taking a literal
    # delimiter, so the character-class pattern below may never match and
    # the whole line may be split - confirm whether a prefix cut was intended.
    leading_text = f.substring_index('data', "[\t ]+", 9)
    tokens = body_rows.select(
        "filename",
        f.split(leading_text, "[\t ]+").alias("split_data"),
        body_rows.lineid.alias("VAR_IDX"))

    def field(position):
        # 1-based access into the split line
        return f.element_at(tokens.split_data, position)

    # Now pull out the columns one at a time, casting non-strings to the
    # appropriate type. Split out INFO and FORMAT here
    variant = tokens.select(
        "filename",
        "VAR_IDX",
        field(1).alias("CHR"),
        field(2).cast(IntegerType()).alias("POS"),
        field(3).alias("ID"),
        field(4).alias("REF"),
        field(5).alias("ALT"),
        field(6).cast(FloatType()).alias("QUAL"),
        field(7).alias("FILTER"),
        f.split(field(8), ";").alias("INFO"),
        f.split(field(9), ":").alias("FORMAT"))
    return(variant)
def _chr_from_filename(filename):
    """Return the chromosome label that follows 'chr' in a filename, or 'ND'."""
    marker = filename.find("chr")
    if marker < 0:
        # str.find returns -1 when absent; index 0 is a valid match
        return 'ND'
    # Text between "chr" and the next "." (or the end when there is no ".")
    label = filename[marker + 3:].partition(".")[0]
    return '23' if label == 'X' else label


def variants_from_impute (infile):
    """
    Extract the variant columns from an IMPUTE-style data frame.

    CHR is derived from each distinct filename: the text between "chr" and
    the following "." ('X' is recoded as '23'; filenames without "chr" get
    'ND').  Lines starting with '#' are dropped.  The first five fields of
    every remaining line become COL1ID, RAWID, POS, ALLELE1 and ALLELE2, and
    EXTRACTID is the part of RAWID before the first ':'.

    :param infile: DataFrame with filename, lineid and data columns
    :return: DataFrame with the variant columns plus split_id and EXTRACTID
    """
    # First map the filename to CHR
    chrs = {
        row.filename: _chr_from_filename(row.filename)
        for row in infile.select("filename").distinct().collect()
    }

    # Get the main data and put a unique index on each variant. Add in the CHR here.
    maindata = infile.filter(infile.data[0:1] != "#")
    splitdata = maindata.select(
        maindata.filename,
        maindata.lineid.alias("VAR_IDX"),
        maindata.data,
        f.split(maindata.data, "[\t ]+").alias("split_data"),
        # CHR starts as a copy of filename and is translated by replace()
        maindata.filename.alias("CHR")).replace(chrs, subset="CHR")

    # Now pull out the first five columns one at a time, casting non-strings
    # to appropriate type.
    variant = splitdata.select(
        "filename", "VAR_IDX", "data", "CHR",
        f.element_at(splitdata.split_data, 1).alias("COL1ID"),
        f.element_at(splitdata.split_data, 2).alias("RAWID"),
        f.element_at(splitdata.split_data, 3).cast(IntegerType()).alias("POS"),
        f.element_at(splitdata.split_data, 4).alias("ALLELE1"),
        f.element_at(splitdata.split_data, 5).alias("ALLELE2"))

    # Next, get the rsID, if present in the RAWID column
    variant2 = variant.select("*", f.split(variant.RAWID, ":").alias("split_id"))
    variant3 = variant2.select(
        "*", f.element_at(variant2.split_id, 1).alias("EXTRACTID"))
    return(variant3)
def main(desc_file, evid_file, cell_file, out_file):
    """
    Build CRISPR evidence from the evidence, description and cell-line TSVs
    and hand the result to ``write_evidence_strings``.

    :param desc_file: TSV with the descriptions
    :param evid_file: TSV with the evidence rows
    :param cell_file: TSV with the cell/tissue annotation
    :param out_file: destination passed to ``write_evidence_strings``
    """
    spark_settings = {
        'spark.driver.memory': '15g',
        'spark.executor.memory': '15g',
        'spark.driver.maxResultSize': '0',
        'spark.debug.maxToStringFields': '2000',
        'spark.sql.execution.arrow.maxRecordsPerBatch': '500000',
    }
    spark_conf = SparkConf()
    for setting, value in spark_settings.items():
        spark_conf = spark_conf.set(setting, value)
    spark = SparkSession.builder.config(conf=spark_conf).master('local[*]').getOrCreate()

    # Log parameters:
    logging.info(f'Evidence file: {evid_file}')
    logging.info(f'Description file: {desc_file}')
    logging.info(f'Cell type annotation: {cell_file}')
    logging.info(f'Output file: {out_file}')

    # Read files:
    evidence_df = spark.read.csv(evid_file, sep='\t', header=True)
    evidence_df = evidence_df.drop('pmid', 'gene_set_name', 'disease_name')
    cell_lines_df = spark.read.csv(cell_file, sep='\t', header=True)
    description_df = spark.read.csv(desc_file, sep='\t', header=True)

    # Logging dataframe stats:
    logging.info(f'Number of evidence: {evidence_df.count()}')
    logging.info(f'Number of descriptions: {description_df.count()}')
    logging.info(f'Number of cell/tissue annotation: {cell_lines_df.count()}')

    # Tissues and cancer types are annotated together in the same column
    # (tissue_or_cancer_type). To disambiguate one from another, the column
    # is combined with the cell lines.
    # First on the tissue level:
    tissue_desc = description_df.withColumnRenamed(
        'tissue_or_cancer_type', 'tissue'
    ).join(cell_lines_df, on='tissue', how='inner')

    # And then on the disease level:
    cell_desc = description_df.withColumnRenamed(
        'tissue_or_cancer_type', 'diseaseFromSource'
    ).join(cell_lines_df, on='diseaseFromSource', how='inner')

    # Concatenating the above generated dataframes.
    # NOTE(review): union matches columns by position, not by name - confirm
    # both joins produce the same column order.
    combined_desc = cell_desc.union(tissue_desc)

    # Aggregating by disease and method; the cell annotation is aggregated
    # in a list of struct:
    cell_line_struct = struct(col('name'), col('id'), col('tissue'), col('tissueId'))
    merged_annotation = (
        combined_desc
        .groupBy('diseaseFromSource', 'efo_id', 'method')
        .agg(collect_set(cell_line_struct).alias('diseaseCellLines'))
        .drop('method')
    )

    # Joining merged annotation with evidence:
    selected_evidence = evidence_df.select(
        col('target_id').alias('targetFromSourceId'),
        col('disease_id').alias('efo_id'),
        col('score').alias('resourceScore').cast(FloatType()),
    )
    # Some of the target identifier are not Ensembl Gene id - replace them:
    mapped_evidence = selected_evidence.replace(
        to_replace=CRISPR_SYMBOL_MAPPING, subset=['targetFromSourceId'])
    # Merging with descriptions:
    annotated_evidence = mapped_evidence.join(
        merged_annotation, on='efo_id', how='outer')

    # From EFO uri, generate EFO id:
    efo_short_id = element_at(split(col('efo_id'), '/'), -1).alias(
        'diseaseFromSourceMappedId')
    pooled_evidence_df = (
        annotated_evidence
        .withColumn('diseaseFromSourceMappedId', efo_short_id)
        .drop('efo_id')
        # Adding constants:
        .withColumn('datasourceId', lit('crispr'))
        .withColumn('datatypeId', lit('affected_pathway'))
        .persist()
    )

    logging.info(
        f'Saving {pooled_evidence_df.count()} CRISPR evidence in JSON format, to: {out_file}'
    )
    write_evidence_strings(pooled_evidence_df, out_file)
def debug_augmentation(df):
    """
    List each distinct customerID alongside ``originalID`` (a 10-character
    substring of it) and ``suffix`` (its third '-'-separated piece).
    """
    unique_ids = df.select("customerID").distinct()

    original_id = F.substring("customerID", 0, 10).alias("originalID")
    id_pieces = F.split("customerID", "-", -1)
    suffix = F.element_at(id_pieces, 3).alias("suffix")

    return unique_ids.select("customerID", original_id, suffix)
key_data = spark.sql("SELECT `key` FROM logging_demp.key_table")


## Create the parent/child pairs we need to build our structure
def zip_pairs(value):
    """
    Pair every element of ``value`` with the element that precedes it.

    The first element is paired with None, e.g. ['a', 'b', 'c'] gives
    [(None, 'a'), ('a', 'b'), ('b', 'c')].  A null or empty input is
    returned unchanged so a null key does not fail the UDF.
    """
    if not value:
        return value
    # Shift the list right by one, padding the front with None
    lead_list = [None] + list(value[:-1])
    return list(zip(lead_list, value))


pairZip = udf(zip_pairs, ArrayType(ArrayType(StringType())))

df2 = (
    key_data.select("key")
    .withColumn("key_split", split(col("key"), "/"))
    .withColumn("depth", size(col("key_split")))
    # The file is the last '/'-separated piece of the key
    .withColumn("file", element_at(col("key_split"), -1))
    .withColumn("pairs", pairZip(col("key_split")))
)

## Schema to create.
## Prefixes all refer to a file.
## A file is the last thing in the prefix (-1 index in the Python list once we split).
## All other bits are parents.
## Two types:
##   "Folder" / File / "table", which is a group of files.
##   Tables can have partitions too, which may be important.
## Note: Folder doesn't matter for perf testing but is used for permission
## models and end-user exploration.
## Folder has a parent attribute.
## File too, but File can change depending on repacking and
## drop table / append / repartition write.
def main(input_dir, output_dir):
    """
    Train a decision-tree sentiment classifier on headline JSON, print its
    F1 and accuracy on a 25% validation split, and save the fitted pipeline.

    :param input_dir: location of the JSON input, read with an explicit schema
    :param output_dir: where the fitted pipeline model is saved (overwritten)
    """
    # main logic starts here
    schema_fields = [
        types.StructField('title_clean', types.StringType()),
        types.StructField('polarity_subjectivity',
                          types.ArrayType(types.FloatType())),
        types.StructField('score', types.LongType()),
        types.StructField('num_comments', types.LongType()),
    ]
    df_schema = types.StructType(schema_fields)

    headlines_df = spark.read.json(
        input_dir, encoding='utf-8', schema=df_schema).repartition(80)

    # polarity_subjectivity is a two-element array: [polarity, subjectivity]
    sentiment_pair = headlines_df['polarity_subjectivity']
    split_sentiment_df = headlines_df.withColumn(
        'polarity', functions.element_at(sentiment_pair, 1))
    split_sentiment_df = split_sentiment_df.withColumn(
        'subjectivity', functions.element_at(sentiment_pair, 2))
    df_sentiment = split_sentiment_df.withColumn(
        'label', get_label(split_sentiment_df['polarity']))

    training_set, validation_set = df_sentiment.randomSplit([0.75, 0.25])

    headline_vector_size = 3
    word_freq_vector_size = 100

    stages = []
    stages.append(Tokenizer(inputCol='title_clean', outputCol='words'))
    stages.append(Word2Vec(vectorSize=headline_vector_size,
                           minCount=0,
                           inputCol='words',
                           outputCol='headline_vector'))
    stages.append(HashingTF(inputCol='words',
                            outputCol='word_counts',
                            numFeatures=word_freq_vector_size))
    stages.append(IDF(inputCol='word_counts',
                      outputCol='word_frequecy',
                      minDocFreq=5))
    # The two size hints are needed for streaming
    stages.append(VectorSizeHint(inputCol='headline_vector',
                                 size=headline_vector_size))
    stages.append(VectorSizeHint(inputCol='word_frequecy',
                                 size=word_freq_vector_size))
    stages.append(VectorAssembler(
        inputCols=[
            'headline_vector', 'score', 'num_comments', 'subjectivity',
            'word_frequecy'
        ],
        outputCol='features'))
    stages.append(DecisionTreeClassifier(featuresCol='features',
                                         labelCol='label',
                                         predictionCol='prediction',
                                         maxDepth=9))

    sentiment_model = Pipeline(stages=stages).fit(training_set)
    validation_predictions = sentiment_model.transform(validation_set)

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='label')
    validation_score = evaluator.evaluate(validation_predictions)
    print('Validation score for Sentiment model F1: %g' % (validation_score, ))

    accuracy_params = {evaluator.metricName: "accuracy"}
    validation_score_accuracy = evaluator.evaluate(validation_predictions,
                                                   accuracy_params)
    print('Validation score for Sentiment model Accuracy: %g' %
          (validation_score_accuracy, ))

    sentiment_model.write().overwrite().save(output_dir)
def get_department(the_col: Union[str, f.Column]) -> f.Column:
    """Return the part of the column's value that precedes the first '_'.

    ``the_col`` may be a Column or a column name.
    """
    if isinstance(the_col, f.Column):
        source = the_col
    else:
        source = f.col(the_col)
    pieces = f.split(source, '_')
    return f.element_at(pieces, 1)
def process_biomarkers(
    self,
    biomarkers_df: DataFrame,
    source_df: DataFrame,
    disease_df: DataFrame,
    drugs_df: DataFrame
) -> DataFrame:
    """The diverse steps to prepare and enrich the input table

    The multi-valued columns of ``biomarkers_df`` are split and exploded,
    the rows are joined with ``source_df`` (on 'source'), ``disease_df``
    (on 'tumor_type') and ``drugs_df`` (on 'drug'), and the result is
    grouped into evidence records.

    Args:
        biomarkers_df: input table with the columns selected in the first step
        source_df: joined on 'source'; its 'niceName' and 'url' columns are used
        disease_df: joined on 'tumor_type'
        drugs_df: joined on 'drug'

    Returns:
        The grouped evidence DataFrame, which is also stored in
        ``self.evidence``.
    """

    biomarkers_enriched = (
        biomarkers_df
        # Split every delimited column into an array
        .select(
            'Biomarker', 'IndividualMutation',
            array_distinct(split(col('Alteration'), ';')).alias('alterations'),
            array_distinct(split(col('Gene'), ';')).alias('gene'),
            split(col('AlterationType'), ';').alias('alteration_types'),
            array_distinct(split(col("PrimaryTumorTypeFullName"), ";")).alias('tumor_type_full_name'),
            array_distinct(split(col('Drug'), ';|,')).alias('drug'),
            'DrugFullName', 'Association', 'gDNA',
            array_distinct(split(col('EvidenceLevel'), ',')).alias('confidence'),
            array_distinct(split(col('Source'), ';')).alias('source')
        )
        # One row per confidence / tumor type / drug / gene value
        .withColumn('confidence', explode(col('confidence')))
        .withColumn('tumor_type_full_name', explode(col('tumor_type_full_name')))
        # tumor_type is the full name with spaces and hyphens removed
        .withColumn('tumor_type', translate(col('tumor_type_full_name'), ' -', ''))
        .withColumn('drug', explode(col('drug')))
        # Remove square brackets from the drug name
        .withColumn('drug', translate(col('drug'), '[]', ''))
        .withColumn('gene', explode(col('gene')))
        .replace(to_replace=GENENAMESOVERRIDE, subset=['gene'])
        .withColumn('gene', upper(col('gene')))
        # At this stage alterations and alteration_types are both arrays
        # Disambiguation when the biomarker consists of multiple alterations is needed
        # This is solved by:
        # 1. Zipping both fields - tmp consists of a list of alteration/type tuples
        # 2. tmp is exploded - tmp consists of the alteration/type tuple
        # 3. alteration & alteration_type columns are overwritten with the elements in the tuple
        .withColumn(
            'tmp',
            self.zip_alterations_with_type_udf(col('alterations'), col('alteration_types')))
        .withColumn('tmp', explode(col('tmp')))
        .withColumn('alteration_type', element_at(col('tmp'), 2))
        # IndividualMutation, when present, takes precedence over the zipped alteration
        .withColumn(
            'alteration',
            when(
                ~col('IndividualMutation').isNull(),
                col('IndividualMutation')
            )
            .otherwise(element_at(col('tmp'), 1))
        )
        .drop('tmp')
        # Clean special cases on the alteration string
        .withColumn(
            'alteration',
            when(
                col('alteration') == 'NRAS:.12.,.13.,.59.,.61.,.117.,.146.',
                col('Biomarker')  # 'NRAS (12,13,59,61,117,146)'
            )
            .when(
                # Cleans strings like 'ARAF:.'
                col('alteration').contains(':.'),
                translate(col('alteration'), ':.', '')
            )
            .when(
                # Fusion genes are described with '__'
                # biomarker is a cleaner representation when there's one alteration
                (col('alteration').contains('__')) & (~col('Biomarker').contains('+')),
                col('Biomarker')
            )
            .otherwise(col('alteration'))
        )
        # Split source into literature and urls
        # literature contains PMIDs
        # urls are enriched from the source table if not a CT
        .withColumn('source', explode(col('source')))
        .withColumn('source', trim(regexp_extract(col('source'), r'(PMID:\d+)|([\w ]+)', 0).alias('source')))
        .join(source_df, on='source', how='left')
        .withColumn(
            'literature',
            when(col('source').startswith('PMID'), regexp_extract(col('source'), r'(PMID:)(\d+)', 2))
        )
        .withColumn(
            'urls',
            when(
                col('source').startswith('NCT'),
                struct(
                    lit('Clinical Trials').alias('niceName'),
                    concat(lit('https://clinicaltrials.gov/ct2/show/'), col('source')).alias('url')
                )
            )
            .when(
                # NOTE(review): a value cannot start with both 'PMID' and
                # 'NCIT', so this OR holds for every non-null source -
                # confirm whether AND (and 'NCT') was intended.
                (~col('source').startswith('PMID')) | (~col('source').startswith('NCIT')),
                struct(col('niceName'), col('url'))
            )
        )
        # The previous conditional clause creates a struct regardless of
        # whether any condition is met. The empty struct is replaced with null
        .withColumn('urls', when(~col('urls.niceName').isNull(), col('urls')))
        # Enrich data
        .withColumn('functionalConsequenceId', col('alteration_type'))
        .replace(to_replace=ALTERATIONTYPE2FUNCTIONCSQ, subset=['functionalConsequenceId'])
        .replace(to_replace=DRUGRESPONSE2EFO, subset=['Association'])
        .join(disease_df, on='tumor_type', how='left')
        .withColumn('drug', upper(col('drug')))
        .withColumn(
            # drug class is coalesced when the precise name of the medicine is not provided
            'drug',
            when(col('drug') == '', col('DrugFullName')).otherwise(col('drug')))
        .join(drugs_df, on='drug', how='left')
        .withColumn('drug', initcap(col('drug')))
        # Translate variantId
        .withColumn(
            'variantId',
            when(~col('gDNA').isNull(), self.get_variantId_udf(col('gDNA')))
        )
        # Assign a GO ID when a gene expression data is reported
        .withColumn(
            'geneExpressionId',
            when(
                (col('alteration_type') == 'EXPR') & (col('alteration').contains('over')),
                'GO_0010628'
            )
            .when(
                (col('alteration_type') == 'EXPR') & (col('alteration').contains('under')),
                'GO_0010629'
            )
            .when(
                (col('alteration_type') == 'EXPR') & (col('alteration').contains('norm')),
                'GO_0010467'
            )
        )
        # Create variant struct
        .withColumn(
            'variant',
            when(
                col('alteration_type') != 'EXPR',
                struct(
                    col('alteration').alias('name'),
                    col('variantId').alias('id'),
                    col('functionalConsequenceId')
                )
            )
        )
        # Create geneExpression struct
        .withColumn(
            'geneExpression',
            when(
                col('alteration_type') == 'EXPR',
                struct(
                    col('alteration').alias('name'),
                    col('geneExpressionId').alias('id'))
            )
        )
    )

    # Add the constant columns, rename to the evidence field names and drop
    # the helper columns that are no longer needed
    pre_evidence = (
        biomarkers_enriched
        .withColumn('datasourceId', lit('cancer_biomarkers'))
        .withColumn('datatypeId', lit('affected_pathway'))
        .withColumnRenamed('tumor_type_full_name', 'diseaseFromSource')
        .withColumnRenamed('drug', 'drugFromSource')
        # diseaseFromSourceMappedId, drugId populated above
        .withColumnRenamed('Association', 'drugResponse')
        # confidence, literature and urls populated above
        .withColumnRenamed('gene', 'targetFromSourceId')
        .withColumnRenamed('Biomarker', 'biomarkerName')
        # variant, geneExpression populated above
        .drop(
            'tumor_type', 'source', 'alteration', 'alteration_type', 'IndividualMutation', 'geneExpressionId',
            'gDNA', 'functionalConsequenceId', 'variantId', 'DrugFullName', 'niceName', 'url')
    )

    # Group evidence
    self.evidence = (
        pre_evidence
        .groupBy('datasourceId', 'datatypeId', 'drugFromSource', 'drugId',
                 'drugResponse', 'targetFromSourceId', 'diseaseFromSource',
                 'diseaseFromSourceMappedId', 'confidence', 'biomarkerName')
        .agg(
            collect_set('literature').alias('literature'),
            collect_set('urls').alias('urls'),
            collect_set('variant').alias('variant'),
            collect_set('geneExpression').alias('geneExpression'),
        )
        # Replace empty lists with null values
        .withColumn('literature', when(size(col('literature')) == 0, lit(None)).otherwise(col('literature')))
        .withColumn('urls', when(size(col('urls')) == 0, lit(None)).otherwise(col('urls')))
        .withColumn('variant', when(size(col('variant')) == 0, lit(None)).otherwise(col('variant')))
        .withColumn(
            'geneExpression',
            when(size(col('geneExpression')) == 0, lit(None))
            .otherwise(col('geneExpression')))
        # Collect variant info into biomarkers struct
        .withColumn(
            'biomarkers',
            struct(
                'variant',
                'geneExpression'
            ))
        .drop('variant', 'geneExpression')
        .distinct()
    )

    return self.evidence
def main(spark):
    """Combine higher-education institutions with HUD and census county data.

    Loads three CSV files (census population estimates, DAPIP institution
    campuses, HUD county/ZIP mapping) whose locations are resolved through
    ``get_absolute_file_path``, joins institutions to counties via the ZIP
    code, attaches the census county record, and logs/shows the intermediate
    and final results.

    Args:
        spark: Spark session used to read the CSV files.
    """
    # --- Census data ------------------------------------------------------
    path = '../../../../data/census/'
    filename = "PEP_2017_PEPANNRES.csv"
    absolute_file_path = get_absolute_file_path(path, filename)

    census_df = spark.read.format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .option("encoding", "cp1252") \
        .load(absolute_file_path)

    # Keep only the 2017 population, the county id and the county label.
    census_df = census_df \
        .drop("GEO.id", "rescen42010", "resbase42010",
              "respop72010", "respop72011", "respop72012", "respop72013",
              "respop72014", "respop72015", "respop72016") \
        .withColumnRenamed("respop72017", "pop2017") \
        .withColumnRenamed("GEO.id2", "countyId") \
        .withColumnRenamed("GEO.display-label", "county")

    logging.warning("Census data")
    census_df.sample(0.1).show(3, False)
    census_df.printSchema()

    # --- Higher education institutions (and yes, there is an Arkansas
    # College of Barbering and Hair Design) --------------------------------
    path = '../../../../data/dapip/'
    filename = "InstitutionCampus.csv"
    absolute_file_path = get_absolute_file_path(path, filename)

    higher_ed_df = spark.read.format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load(absolute_file_path)

    # The ZIP code is taken from the last space-separated token of the
    # address; a "12345-6789" style token is reduced to its first part.
    higher_ed_df = higher_ed_df \
        .filter("LocationType = 'Institution'") \
        .withColumn("addressElements", F.split(F.col("Address"), " "))
    higher_ed_df = higher_ed_df.withColumn(
        "addressElementCount", F.size(F.col("addressElements")))
    higher_ed_df = higher_ed_df.withColumn(
        "zip9",
        F.element_at(F.col("addressElements"), F.col("addressElementCount")))
    higher_ed_df = higher_ed_df.withColumn(
        "splitZipCode", F.split(F.col("zip9"), "-"))

    higher_ed_df = higher_ed_df \
        .withColumn("zip", F.col("splitZipCode")[0]) \
        .withColumnRenamed("LocationName", "location") \
        .drop("DapipId", "OpeId", "ParentName", "ParentDapipId",
              "LocationType", "Address", "GeneralPhone", "AdminName",
              "AdminPhone", "AdminEmail", "Fax", "UpdateDate",
              "zip9", "addressElements", "addressElementCount",
              "splitZipCode") \
        .alias("highered")

    logging.warning("Higher education institutions (DAPIP)")
    higher_ed_df.sample(0.1).show(3, False)
    higher_ed_df.printSchema()

    # --- ZIP to county (HUD) ----------------------------------------------
    path = '../../../../data/hud/'
    filename = "COUNTY_ZIP_092018.csv"
    absolute_file_path = get_absolute_file_path(path, filename)

    county_zip_df = spark.read.format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load(absolute_file_path)

    county_zip_df = county_zip_df \
        .drop("res_ratio", "bus_ratio", "oth_ratio", "tot_ratio") \
        .alias("hud")

    logging.warning("Counties / ZIP Codes (HUD)")
    county_zip_df.sample(0.1).show(3, False)
    county_zip_df.printSchema()

    # --- Institutions per county id ---------------------------------------
    instit_per_county_df = higher_ed_df.join(
        county_zip_df,
        higher_ed_df["zip"] == county_zip_df["zip"],
        "inner")

    # The join above is an inner join, so the message says so.
    logging.warning("Higher education institutions inner-joined with HUD")
    instit_per_county_df.filter(higher_ed_df["zip"] == 27517) \
        .show(20, False)
    instit_per_county_df.printSchema()

    # --- Institutions per county name -------------------------------------
    instit_per_county_df = instit_per_county_df.join(
        census_df,
        instit_per_county_df["county"] == census_df["countyId"],
        "left")

    logging.warning("Higher education institutions and county id with census")
    instit_per_county_df.filter(higher_ed_df["zip"] == 27517) \
        .show(20, False)
    instit_per_county_df.filter(higher_ed_df["zip"] == 2138) \
        .show(20, False)

    # --- Final clean up ---------------------------------------------------
    # drop() with a string treats the name literally, so "highered.zip" and
    # "hud.county" would match no column and silently do nothing. Dropping
    # through Column references removes the intended duplicate columns.
    instit_per_county_df = instit_per_county_df \
        .drop(higher_ed_df["zip"]) \
        .drop(county_zip_df["county"]) \
        .drop("countyId") \
        .distinct()

    logging.warning("Final list")
    instit_per_county_df.show(200, False)
    logging.warning("The combined list has {} elements.".format(
        instit_per_county_df.count()))
chains.persist()

#%%
# Follow each chain one step: attach the tweet whose id is the chain's `next`.
next_tweet_match = chains['next'] == data['tweet_id']
chains = chains.join(data, next_tweet_match, 'inner')
chains = chains.select(
    'sender',
    'tweets',
    data['text'].alias('response'),
    data['author_id'],
    data['response_tweet_id'].alias('next'),
)

#%%
# Flag responses written by the same author as the last entry of `sender`.
last_sender = functions.element_at('sender', -1)
chains = chains.withColumn(
    'self_response', last_sender == functions.col('author_id'))

#%%
chains.persist()

# Add to samples: every step whose response is not a self-response.
new_samples = chains.filter(~chains['self_response'])
new_samples = new_samples.select('sender', 'tweets', 'response', 'author_id')
samples = samples.unionAll(new_samples)
samples = samples.checkpoint()

#%%
# Remove finished chains (those whose `next` is the literal 'none').
chains = chains.filter(chains['next'] != 'none')
chains.persist()
def main(input_dir, output_dir):
    """Plot the normalized daily average polarity of top-scoring headlines.

    Reads headline JSON from ``input_dir``, keeps for every day only the
    headlines whose score is among that day's ``top_scores`` selection,
    averages their polarity per day, min-max scales the averages and saves a
    scatter-style plot to ``output_dir + '/sentiment_series.png'``.

    Args:
        input_dir: location of the JSON input passed to ``spark.read.json``.
        output_dir: directory prefix for the generated PNG.
    """
    headline_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('created_utc_iso', types.DateType()),
        types.StructField('polarity_subjectivity',
                          types.ArrayType(types.FloatType())),
        types.StructField('score', types.LongType()),
    ])
    headlines = (
        spark.read
        .json(input_dir, encoding='utf-8', schema=headline_schema)
        .repartition(80)
        .cache()
    )

    # Distinct scores seen on each day, reduced by the top_scores helper.
    per_day_scores = headlines.groupBy(headlines['created_utc_iso']).agg(
        functions.collect_set(headlines['score']).alias('scores_per_day')
    )
    per_day_top = per_day_scores.withColumn(
        'sorted_scores', top_scores(per_day_scores['scores_per_day'])
    )

    # Keep only the headlines whose score is in that day's top-score array.
    sentiment_pair = headlines['polarity_subjectivity']
    with_day_tops = headlines.join(
        functions.broadcast(per_day_top), on=['created_utc_iso'])
    only_top = with_day_tops.where(
        arr_contains(per_day_top['sorted_scores'], headlines['score'])
    )
    top_headlines = only_top.select(
        headlines['title_clean'],
        headlines['created_utc_iso'],
        sentiment_pair,
        headlines['score'],
    )
    # polarity_subjectivity is a two-element array: [polarity, subjectivity].
    top_headlines = top_headlines.withColumn(
        'polarity', functions.element_at(sentiment_pair, 1))
    top_headlines = top_headlines.withColumn(
        'subjectivity', functions.element_at(sentiment_pair, 2))

    daily_sentiment = (
        top_headlines
        .groupBy(top_headlines['created_utc_iso'])
        .agg(functions.avg(top_headlines['polarity']).alias('avg_sentiment'))
        .cache()
    )

    # MinMaxScaler works on vector columns, hence the assembler stage.
    stages = [
        VectorAssembler(inputCols=['avg_sentiment'], outputCol='features'),
        MinMaxScaler(inputCol='features', outputCol='normalized_avg_vector'),
    ]
    fitted = Pipeline(stages=stages).fit(daily_sentiment)
    scaled = fitted.transform(daily_sentiment)
    scaled = scaled.withColumn(
        'normalized_avg', first_element(scaled['normalized_avg_vector']))

    # One row per day (2008-2019, roughly 4000 records), so collecting the
    # aggregate to the driver is cheap. Writing it out as a single gzip CSV
    # via coalesce(1) was the earlier alternative to plotting here.
    series_frame = scaled.select(
        scaled['created_utc_iso'].alias('date'),
        scaled['normalized_avg'].alias('sentiment'),
    ).toPandas()
    series_frame = series_frame.set_index('date')

    plt.plot(series_frame['sentiment'],
             marker='.', alpha=0.5, linestyle='None')
    plt.savefig(output_dir + '/sentiment_series.png')
def compile_array_index(t, expr, scope, **kwargs):
    """Translate an array-index expression into Spark's ``element_at``.

    Args:
        t: translator whose ``translate`` compiles the array argument.
        expr: expression whose op carries ``arg`` (the array) and ``index``
            (read here through ``index.op().value``).
        scope: scope forwarded to ``t.translate``.
        **kwargs: accepted for signature compatibility; unused.

    Returns:
        The ``F.element_at`` column for the requested position.
    """
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    index = op.index.op().value

    # element_at is 1-based for positions counted from the start, so a
    # 0-based index is shifted by one. Negative positions already count from
    # the end (-1 is the last element) and must stay unchanged: shifting
    # them would turn -1 into 0, which element_at rejects, and every other
    # negative index into the wrong element.
    spark_index = index + 1 if index >= 0 else index
    return F.element_at(src_column, spark_index)
def task_3(data_io, product_data):
    """Compute also-viewed statistics per product and save a summary.

    For every product, ``countAlsoViewed`` is the length of the
    ``also_viewed`` entry of ``related`` (null when that entry is missing)
    and ``meanPriceAlsoViewed`` is the average price of the also-viewed
    products that appear in ``product_data``. The summary statistics of both
    columns are stored through ``data_io.save`` and returned.

    Args:
        data_io: object whose ``save(res, 'task_3')`` persists the result.
        product_data: DataFrame with ``asin``, ``price`` and ``related``.

    Returns:
        dict with the total row count and, for each output column, its mean,
        variance and number of nulls.
    """
    # -----------------------------Column names--------------------------------
    # Inputs:
    asin_column = 'asin'
    price_column = 'price'
    attribute = 'also_viewed'
    related_column = 'related'
    # Outputs:
    meanPriceAlsoViewed_column = 'meanPriceAlsoViewed'
    countAlsoViewed_column = 'countAlsoViewed'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    also_viewed = F.col(attribute)

    # Alias the extracted list explicitly instead of relying on Spark's
    # auto-generated column name for the element_at expression.
    products = product_data.select(
        asin_column,
        price_column,
        F.element_at(related_column, attribute).alias(attribute),
    ).withColumn(
        # Guard against null lists directly: size() of null may be -1 or
        # null depending on configuration, and a blanket replace(-1, None)
        # would also touch other numeric columns such as the price.
        countAlsoViewed_column,
        F.when(also_viewed.isNotNull(), F.size(also_viewed)),
    )

    # One row per (product, also-viewed product) pair.
    viewed_pairs = products.select(
        asin_column,
        F.explode(also_viewed).alias('to_join'),
    )
    # Price lookup keyed by the also-viewed product id.
    price_lookup = products.select(
        F.col(asin_column).alias('to_join'),
        price_column,
    )
    mean_prices = viewed_pairs \
        .join(price_lookup, on='to_join', how='inner') \
        .groupBy(asin_column) \
        .agg(F.avg(price_column).alias(meanPriceAlsoViewed_column))

    out_df = products.join(mean_prices, on=asin_column, how='left')

    # Gather every statistic in a single pass instead of one Spark job each.
    mean_price = F.col(meanPriceAlsoViewed_column)
    count_viewed = F.col(countAlsoViewed_column)
    stats = out_df.agg(
        F.count(F.lit(1)).alias('count_total'),
        F.avg(mean_price).alias('mean_meanPriceAlsoViewed'),
        F.variance(mean_price).alias('variance_meanPriceAlsoViewed'),
        F.count(F.when(mean_price.isNull(), 1))
        .alias('numNulls_meanPriceAlsoViewed'),
        F.avg(count_viewed).alias('mean_countAlsoViewed'),
        F.variance(count_viewed).alias('variance_countAlsoViewed'),
        F.count(F.when(count_viewed.isNull(), 1))
        .alias('numNulls_countAlsoViewed'),
    ).head()
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': int(stats['count_total']),
        'mean_meanPriceAlsoViewed': float(stats['mean_meanPriceAlsoViewed']),
        'variance_meanPriceAlsoViewed':
            float(stats['variance_meanPriceAlsoViewed']),
        'numNulls_meanPriceAlsoViewed':
            int(stats['numNulls_meanPriceAlsoViewed']),
        'mean_countAlsoViewed': float(stats['mean_countAlsoViewed']),
        'variance_countAlsoViewed': float(stats['variance_countAlsoViewed']),
        'numNulls_countAlsoViewed': int(stats['numNulls_countAlsoViewed']),
    }
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_3')
    return res
def foreach_jdbc_writer(df, epoch_id):
    """Append one micro-batch to the ``amazon_products`` MySQL table.

    Args:
        df: the batch DataFrame handed over by ``foreachBatch``.
        epoch_id: batch identifier supplied by ``foreachBatch``; unused.
    """
    connection_properties = {
        "driver": "com.mysql.cj.jdbc.Driver",
        "user": "******",
    }
    df.write.jdbc(
        url="jdbc:mysql://localhost/world",
        table="amazon_products",
        mode='append',
        properties=connection_properties,
    )


spark = SparkSession.builder \
    .master('local[2]') \
    .appName('StreamingDemo') \
    .getOrCreate()

# Stream of raw Kafka records from the 'amazon' topic.
df = (
    spark.readStream.format('kafka')
    .option('kafka.bootstrap.servers', 'localhost:9092')
    .option('subscribe', 'amazon')
    .load()
)

# NOTE(review): pickle.loads can execute arbitrary code while deserializing;
# this is only safe if every producer on the topic is trusted.
deser = udf(lambda x: pickle.loads(x), MapType(StringType(), StringType()))
deserlizedDF = df.withColumn('map', deser(df['value']))

# Plain fields copied out of the deserialized map: (output column, map key).
field_mapping = [
    ('title', 'productTitle'),
    ('Categories', 'productCategories'),
    ('Rating', 'productRating'),
    ('Description', 'productDescription'),
    ('Prices', 'productPrices'),
]
parsedDF = deserlizedDF
for column_name, map_key in field_mapping:
    parsedDF = parsedDF.withColumn(column_name, element_at('map', map_key))

# The price string is split on the '$' separators and cast to floats so the
# smallest and largest listed price can be extracted.
price_values = split(element_at('map', 'productPrices'), r'#*\$') \
    .cast(ArrayType(FloatType()))
parsedDF = parsedDF.withColumn('Min_Price', array_min(price_values))
parsedDF = parsedDF.withColumn('Max_Price', array_max(price_values))

projectedDF = parsedDF.select(
    'title', 'Categories', 'Rating', 'Prices', 'Min_Price', 'Max_Price')

result = projectedDF.writeStream.foreachBatch(foreach_jdbc_writer).start()
result.awaitTermination()
'agg1-street-index', settings=settings, append=False) # In[8]: agg_stop_df = read_elastic("agg1-street-index", array_field="reverse_gecode").withColumnRenamed( 'coordinates', 'agg_coords') # .withColumn('reverse_gecode', F.array_distinct("reverse_gecode")) stop_df = read_elastic('stop-index') reverse_gecode_df = stop_df.join( agg_stop_df, (F.round(F.element_at(stop_df.coordinates, 1), 5) == F.round( F.element_at(agg_stop_df.agg_coords, 1), 5)) & (F.round(F.element_at(stop_df.coordinates, 2), 5) == F.round( F.element_at(agg_stop_df.agg_coords, 2), 5)), how='left').drop('agg_coords') settings = { "settings": { "number_of_shards": 1, "number_of_replicas": 0 }, "mappings": { "properties": { "actualDelay": { "type": "long" },