Example #1
def generate_TFIDF(sc, df, sqlcontext):

    # 1. count the number of rows (documents) in the DataFrame
    t_num = df.count()

    # 2. select _id, lowercase text_entry, strip punctuation symbols,
    #    and split it into a list of words ('tokens')
    word_splits = df.select(
        "_id",
        F.split(F.lower(F.regexp_replace(df.text_entry, r'[^\w\s]', '')), ' ').alias('tokens'))

    # 3. explode the list of words into (_id, token) rows, then group by _id and token
    #    to count the term frequency (tf) of each token in each document,
    #    producing a DataFrame words_tf (_id, token, tf)
    words_tf = word_splits.select("_id", F.explode(word_splits.tokens).alias('token'))\
        .groupBy("_id", "token").agg({'token': 'count'}).withColumnRenamed("count(token)", "tf")

    # 4. to compute document frequency (df), group by token, collect the set of
    #    distinct _ids ('collect_set' eliminates duplicates), and take the size of
    #    that set as the token's df, producing a DataFrame words_df (_id, token, df)
    words_df = words_tf.groupby("token").agg(F.collect_set("_id").alias("_ids"))\
        .select("token", F.explode("_ids").alias('_id'), F.size("_ids").alias('df'))

    # 5. to build the final TF-IDF DataFrame, join words_tf and words_df on matching
    #    _id and token, compute idf as log10 of the total number of documents (t_num)
    #    divided by the document frequency (df), then compute tf_idf = idf * tf
    tokensWithTfIdf = words_tf.join(words_df, (words_tf._id == words_df._id) & (words_tf.token == words_df.token))\
        .select(words_tf._id, words_tf.token, words_tf.tf, words_df.df,
                (F.log10(t_num / words_df.df)).alias("idf"),
                (F.log10(t_num / words_df.df) * words_tf.tf).alias("tf_idf"))

    # 6. cache the TF-IDF DataFrame for further use
    tokensWithTfIdf.cache()
    return tokensWithTfIdf
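
A minimal usage sketch (assumed setup, not part of the original snippet): build a tiny DataFrame with the expected `_id` and `text_entry` columns and compute the TF-IDF table with the function above.

# Minimal usage sketch for generate_TFIDF (assumed data and SparkSession).
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [("d1", "To be, or not to be."), ("d2", "Brevity is the soul of wit.")],
    ["_id", "text_entry"],
)
tfidf = generate_TFIDF(spark.sparkContext, sample, spark)
tfidf.orderBy(F.desc("tf_idf")).show(5)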
Example #2
def add_computed_cols(entityInfo):
    # add computed variables
    entityInfo = entityInfo.withColumn(
        "log_ncat",
        F.when(entityInfo["ncat"] > 0, F.log10("ncat")).otherwise(0))
    entityInfo = entityInfo.withColumn("log_nCustomers",
                                       F.log10(entityInfo["nCustomers"]))
    entityInfo = entityInfo.withColumn("log_spend",
                                       F.log10(entityInfo["spend"] + 0.01))
    entityInfo = entityInfo.withColumn("log_orders",
                                       F.log10(entityInfo["orders"]))
    return entityInfo
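
A minimal usage sketch (the column names and values below are assumptions chosen to match the function), illustrating why `ncat` is guarded with `F.when`: `log10` of a non-positive value yields null, which the guard maps to 0.

# Minimal usage sketch for add_computed_cols (assumed column names/values).
from pyspark.sql import SparkSession
from pyspark.sql import functions as F  # also required by add_computed_cols above

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame(
    [(0, 10, 250.0, 3), (5, 100, 0.0, 7)],
    ["ncat", "nCustomers", "spend", "orders"],
)
# ncat == 0 falls through to .otherwise(0); spend == 0.0 is shifted by +0.01
# before the log, giving log_spend == -2.0 for that row.
add_computed_cols(demo).show()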
Example #3
def createTrans09(sparkDF):
	# ===========================
	# douglas fletcher
	# purpose: create data 
	# transformations (10 at a time) 
	# input: 
	# 	sparkDF type sparkDF
	# output: 
	# 	sparkDFTrans type sparkDF
	# ===========================
	sparkDFTrans = sparkDF
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogUnknownIncomeDebtRatio - log1p(sparkDFTrans.NumberOfOpenCreditLinesAndLoans)) 
		  .alias("LogUnknownIncomeDebtRatioPerLine")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogUnknownIncomeDebtRatio - log1p(sparkDFTrans.NumberRealEstateLoansOrLines)) 
		  .alias("LogUnknownIncomeDebtRatioPerRealEstateLine")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogUnknownIncomeDebtRatio - log1p(sparkDFTrans.NumberOfTimesPastDue)) 
		  .alias("LogUnknownIncomeDebtRatioPerDelinquency")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogUnknownIncomeDebtRatio - log1p(sparkDFTrans.NumberOfTimes90DaysLate)) 
		  .alias("LogUnknownIncomeDebtRatioPer90DaysLate")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (log10(sparkDFTrans.NumberRealEstateLoansOrLines)) 
		  .alias("LogNumberRealEstateLoansOrLines")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogNumberRealEstateLoansOrLines"
		, when(sparkDFTrans.LogNumberRealEstateLoansOrLines.isNull(), 0)
		  .otherwise(sparkDFTrans.LogNumberRealEstateLoansOrLines)
	)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberRealEstateLoansOrLines)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfTimesPastDue)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfTimes90DaysLate)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfTime3059DaysPastDueNotWorse)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfTime6089DaysPastDueNotWorse)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.age < 18, 1).otherwise(0) 
		  .alias("LowAge")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (log10(sparkDFTrans.age - 17)) 
		  .alias("Logage")
	)	
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.age)	
	return sparkDFTrans
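
The `log10(...)` followed by `when(col.isNull(), 0)` pair recurs throughout these createTrans functions. A small helper, sketched here (not part of the original code), makes the intent explicit:

# Sketch of a reusable helper for the recurring pattern above: Spark's log10
# returns null for non-positive input, and that null is then mapped to 0.
from pyspark.sql import functions as F

def safe_log10(column):
    logged = F.log10(column)
    return F.when(logged.isNull(), 0).otherwise(logged)

# e.g. sparkDFTrans.withColumn("LogDebtRatio", safe_log10(sparkDFTrans.DebtRatio))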
Example #4
def tf_idf(df, n):
    # Extracting terms per each row/document as a list
    temp_df = df.withColumn(
        'terms',
        f.split(f.lower(f.regexp_replace(df.text_entry, '[^\\w\\s-]', '')),
                ' '))

    # Calculating total number of words per row/document
    temp_df1 = temp_df.withColumn('total_num_words', f.size('terms'))

    # Extracting words in each documents
    temp_df2 = temp_df1.withColumn('token', f.explode('terms'))

    # Calculating tf
    temp_df3 = temp_df2.groupBy('_id', 'token', 'total_num_words').agg({
        'token':
        'count'
    }).withColumnRenamed('count(token)', 'occurrence').sort('_id')
    temp_df4 = temp_df3.withColumn('tf', temp_df3.occurrence)

    # Calculating df
    temp_df5 = temp_df4.groupBy('token').agg(
        f.countDistinct('_id')).withColumnRenamed('count(DISTINCT _id)', 'df')

    # Calculating idf
    temp_df6 = temp_df5.withColumn('idf', f.log10(n / temp_df5.df))

    # Calculating tf-idf
    joined_df = temp_df4.join(temp_df6,
                              temp_df4.token == temp_df6.token).select(
                                  temp_df4.token, temp_df4._id, temp_df4.tf,
                                  temp_df6.df, temp_df6.idf)
    result = joined_df.withColumn('tf_idf', joined_df.tf * joined_df.idf)

    return result
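
A minimal usage sketch (assumed setup): with two documents, a token that appears in both gets idf = log10(2/2) = 0, while a token appearing in only one gets idf = log10(2/1) ≈ 0.301.

# Minimal usage sketch for tf_idf (assumed data and SparkSession).
from pyspark.sql import SparkSession
from pyspark.sql import functions as f  # also required by tf_idf above

spark = SparkSession.builder.getOrCreate()
docs = spark.createDataFrame(
    [("d1", "the quick brown fox"), ("d2", "the lazy dog")],
    ["_id", "text_entry"],
)
# 'the' occurs in both documents -> idf = 0; every other token -> idf ≈ 0.301
tf_idf(docs, docs.count()).orderBy("token").show()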
Example #5
def get_features_and_labels(transactions_df, transactions_id_cols,
                            transactions_cat_cols):
    # Get features
    non_feature_cols = ['isFraud', 'TransactionDT'
                        ] + transactions_id_cols.split(",")
    feature_cols = [
        col for col in transactions_df.columns if col not in non_feature_cols
    ]
    logger.info(f'transactions_df columns: {transactions_df.columns}')
    logger.info(f'transactions_id_cols columns: {transactions_id_cols}')
    logger.info(f'Feature columns: {feature_cols}')
    logger.info("Categorical columns: {}".format(
        transactions_cat_cols.split(",")))
    features = transactions_df.select(feature_cols)

    kdf_features = features.to_koalas()
    kdf_features = ks.get_dummies(
        kdf_features, columns=transactions_cat_cols.split(",")).fillna(0)

    features = kdf_features.to_spark()
    features = features.withColumn('TransactionAmt',
                                   fc.log10(fc.col('TransactionAmt')))
    logger.info("Transformed feature columns: {}".format(list(
        features.columns)))
    logger.info("Transformed feature count: {}".format(features.count()))
    # Get labels
    labels = transactions_df.select('TransactionID', 'isFraud')
    logger.info("Transformed label columns: {}".format(list(labels.columns)))
    logger.info("Shape of label: {}".format(labels.count()))

    return features, labels
Example #6
def apply_tracklet_cuts(df: DataFrame) -> DataFrame:
    """ Select potential tracklet candidates based on property cuts.

    We first apply 3 criteria to select interesting candidates:

    1. remove alerts with possible counterpart in MPC
    2. remove alerts with negative fluxes
    3. keep only alerts with 1 detection

    Then, based on Sergey's analysis, we limit the analysis to
    the candidates outside the locus of variable stars (and bad subtractions).

    Parameters
    ----------
    df: Spark DataFrame
        Input dataframe containing alert data

    Returns
    ----------
    df_filt: Spark DataFrame
        Spark DataFrame of smaller size containing only potential
        tracklet candidate data based on the cuts.

    Examples
    ----------
    >>> df = spark.read.format('parquet').load(ztf_alert_sample)
    >>> df_filt = apply_tracklet_cuts(df)
    >>> df_filt.count()
    16
    """
    # remove known asteroids
    idx = df['candidate.ssnamenr'] == 'null'

    # Keep only objects unknown to SIMBAD - seems unnecessary
    # idx &= df['cdsxmatch'].isin(['Unknown'])

    # Keep only objects with 1 detection
    idx &= df['candidate.ndethist'] == 1

    # Keep only positive detections
    idx &= df['candidate.isdiffpos'] == 't'

    # Simple definition of locus containing (most of) stellar variability
    # as well as bad subtraction - basically, the variations are fainter than
    # the template object itself, and distance is smaller than typical FWHM
    shiftlog = F.log10(df['candidate.distnr']) + 0.2
    nidx = (df['candidate.magnr'] - df['candidate.magpsf']) < 1.0
    nidx &= (df['candidate.magnr'] - df['candidate.magpsf']) < (-4 * shiftlog)
    nidx &= df['candidate.distnr'] < 2

    df_filt = df.filter(idx & ~nidx).cache()

    return df_filt
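
To make the variability/bad-subtraction locus concrete, here is an illustrative check of the boundary implied by the code above (plain Python; the distnr values are illustrative only):

# Illustrative check of the locus cut: a source is flagged when
#   magnr - magpsf < min(1.0, -4 * (log10(distnr) + 0.2))  and  distnr < 2
import math

for distnr in (0.3, 0.631, 1.0, 1.5):
    boundary = min(1.0, -4 * (math.log10(distnr) + 0.2))
    print(f"distnr={distnr}: flagged if magnr - magpsf < {boundary:.2f}")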
Example #7
def indexing(df_data):
    df_data2 = df_data.select(df_data._id,
                              removepunctuations(df_data.text_entry))
    only_words = Tokenizer(inputCol='textentry', outputCol="words")
    df_data3 = only_words.transform(df_data2)
    df_data4 = df_data3.select(df_data3._id, df_data3.textentry,
                               explode(df_data3.words).alias('token_words'))
    term_freq = df_data4.groupBy("_id", "token_words").agg(
        count("token_words").alias("TF"))
    doc_freq = df_data4.groupBy("token_words").agg(
        countDistinct("_id").alias("DF"))
    idf_calc = doc_freq.withColumn('idf', (111396.0) / doc_freq['DF'])
    idf_calc = idf_calc.withColumn("IDF", log10("idf"))
    tf_idf = term_freq.join(idf_calc, "token_words",
                            "left").withColumn("TF-IDF",
                                               col("TF") * col("IDF"))
    return tf_idf
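
The `removepunctuations` helper is not shown in this snippet, and the hard-coded 111396.0 is presumably the total number of documents in the corpus. A hypothetical definition of the helper (the real one lives elsewhere in the original project) might look like:

# Hypothetical removepunctuations UDF (an assumption, not the original code).
# Note the Tokenizer above expects the resulting column to be named 'textentry',
# so in practice the call would be aliased accordingly.
import re
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

@udf(returnType=StringType())
def removepunctuations(text):
    return re.sub(r'[^\w\s]', '', text) if text is not None else None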
Example #8
def run_entity_extraction(srcFilePath, partName):
    sqlContext.clearCache()
    df = load_data(srcFilePath, data_table + "_" + partName)
    df.printSchema()
    # filter out invalid data
    # df = df.filter(df[entity].isNotNull())

    orderUserInfoDF = build_order_tbl(df)
    entityUserDF = build_entity_user_tbl(orderUserInfoDF)

    entityInfoDF = analyse_transactions(df)

    # add proj Wt
    weightColPerOrder = orderUserInfoDF.groupby(entity).agg(
        F.sum(weightCol).alias(weightCol))
    entityInfoDF = entityInfoDF.join(weightColPerOrder, entity)
    #add growthRate
    entityInfoDF = entityInfoDF.join(
        build_frac_growth_rate_tbl(orderUserInfoDF), entity, how="left_outer")

    print 'generating demographics'
    demographicTbl = generate_demographic_info(entityUserDF, userCols)
    entityInfoDF = entityInfoDF.join(demographicTbl, entity, how='left_outer')

    # finally add bunch of log values for selected cols
    cols_for_log = [
        x for x, y in entityInfoDF.dtypes
        if any(map(x.startswith, summaryOps['logs']))
        if y in ['double', 'float', 'int', 'long', 'bigint']
    ]
    print "gen log for cols", cols_for_log
    for col in cols_for_log:
        entityInfoDF = entityInfoDF.withColumn(
            "log_" + col,
            F.when(entityInfoDF[col] > 0, F.log10(col)).otherwise(0))

    # entityInfoDF = entityInfoDF.cache()
    entityInfoDF.repartition(1).write.mode("overwrite").format('com.databricks.spark.csv') \
        .options(header='true', mode="overwrite") \
        .save('/mnt/' + AWS_BUCKET_NAME + '/' + destEntityInfoPath)
    orderUserInfoDF.unpersist()
    entities = entityInfoDF.select(entity).map(lambda r: r[entity]).collect()
    return (entities, entityUserDF)
Example #9
def createTrans07(sparkDF):
	# ===========================
	# douglas fletcher
	# purpose: create data 
	# transformations (10 at a time) 
	# input: 
	# 	sparkDF type sparkDF
	# output: 
	# 	sparkDFTrans type sparkDF
	# ===========================
	sparkDFTrans = sparkDF
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberRealEstateLoansOrLines /(1+sparkDFTrans.NumberOfDependents))
		  .alias("RealEstateLoansPerPerson")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.age /(1+sparkDFTrans.NumberOfDependents))
		  .alias("YearsOfAgePerDependent")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, log10(sparkDFTrans.MonthlyIncome)
		  .alias("LogMonthlyIncome")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogMonthlyIncome"
		, when(sparkDFTrans.LogMonthlyIncome.isNull(), 0)
		  .otherwise(sparkDFTrans.LogMonthlyIncome)
	)
	sparkDFTrans = sparkDFTrans.drop("MonthlyIncome")
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogMonthlyIncome - log1p(sparkDFTrans.NumberOfDependents))
		  .alias("LogIncomePerPerson")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogMonthlyIncome - log1p(sparkDFTrans.age))
		  .alias("LogIncomeAge")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, log10(sparkDFTrans.NumberOfTimesPastDue)
		  .alias("LogNumberOfTimesPastDue")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogNumberOfTimesPastDue"
		, when(sparkDFTrans.LogNumberOfTimesPastDue.isNull(), 0)
		  .otherwise(sparkDFTrans.LogNumberOfTimesPastDue)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, log10(sparkDFTrans.NumberOfTimes90DaysLate)
		  .alias("LogNumberOfTimes90DaysLate")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogNumberOfTimes90DaysLate"
		, when(sparkDFTrans.LogNumberOfTimes90DaysLate.isNull(), 0)
		  .otherwise(sparkDFTrans.LogNumberOfTimes90DaysLate)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, log10(sparkDFTrans.NumberOfTime3059DaysPastDueNotWorse)
		  .alias("LogNumberOfTime3059DaysPastDueNotWorse")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogNumberOfTime3059DaysPastDueNotWorse"
		, when(sparkDFTrans.LogNumberOfTime3059DaysPastDueNotWorse.isNull(), 0)
		  .otherwise(sparkDFTrans.LogNumberOfTime3059DaysPastDueNotWorse)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, log10(sparkDFTrans.NumberOfTime6089DaysPastDueNotWorse)
		  .alias("LogNumberOfTime6089DaysPastDueNotWorse")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogNumberOfTime6089DaysPastDueNotWorse"
		, when(sparkDFTrans.LogNumberOfTime6089DaysPastDueNotWorse.isNull(), 0)
		  .otherwise(sparkDFTrans.LogNumberOfTime6089DaysPastDueNotWorse)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogNumberOfTimes90DaysLate - sparkDFTrans.LogNumberOfTime3059DaysPastDueNotWorse)
		  .alias("LogRatio90to3059DaysLate")
	)	
	return sparkDFTrans
Example #10
def createTrans06(sparkDF):
	# ===========================
	# douglas fletcher
	# purpose: create data 
	# transformations (10 at a time) 
	# input: 
	# 	sparkDF type sparkDF
	# output: 
	# 	sparkDFTrans type sparkDF
	# ===========================
	sparkDFTrans = sparkDF
	sparkDFTrans = sparkDFTrans.select("*"
		, (log10(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines))
		  .alias("LogRevolvingUtilizationOfUnsecuredLines")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogRevolvingUtilizationOfUnsecuredLines"
		, when(sparkDFTrans.LogRevolvingUtilizationOfUnsecuredLines.isNull(), 0)
		  .otherwise(sparkDFTrans.LogRevolvingUtilizationOfUnsecuredLines)
	)
	sparkDFTrans = sparkDFTrans.drop("RevolvingUtilizationOfUnsecuredLines")
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberOfTimesPastDue / sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
		  .alias("DelinquenciesPerLine")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("DelinquenciesPerLine"
		, when(sparkDFTrans.NumberOfOpenCreditLinesAndLoans == 0, 0)
		  .otherwise(sparkDFTrans.DelinquenciesPerLine)
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberOfTimes90DaysLate / sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
		  .alias("MajorDelinquenciesPerLine")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("MajorDelinquenciesPerLine"
		, when(sparkDFTrans.NumberOfOpenCreditLinesAndLoans == 0, 0)
		  .otherwise(sparkDFTrans.MajorDelinquenciesPerLine)
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberOfTime3089DaysPastDueNotWorse / sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
		  .alias("MinorDelinquenciesPerLine")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("MinorDelinquenciesPerLine"
		, when(sparkDFTrans.NumberOfOpenCreditLinesAndLoans == 0, 0)
		  .otherwise(sparkDFTrans.MinorDelinquenciesPerLine)
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberOfTimesPastDue / sparkDFTrans.RevolvingLines)
		  .alias("DelinquenciesPerRevolvingLine")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("DelinquenciesPerRevolvingLine"
		, when(sparkDFTrans.RevolvingLines == 0, 0)
		  .otherwise(sparkDFTrans.DelinquenciesPerRevolvingLine)
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberOfTimes90DaysLate / sparkDFTrans.RevolvingLines)
		  .alias("MajorDelinquenciesPerRevolvingLine")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("MajorDelinquenciesPerRevolvingLine"
		, when(sparkDFTrans.RevolvingLines == 0, 0)
		  .otherwise(sparkDFTrans.MajorDelinquenciesPerRevolvingLine)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberOfTime3089DaysPastDueNotWorse / sparkDFTrans.RevolvingLines)
		  .alias("MinorDelinquenciesPerRevolvingLine")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("MinorDelinquenciesPerRevolvingLine"
		, when(sparkDFTrans.RevolvingLines == 0, 0)
		  .otherwise(sparkDFTrans.MinorDelinquenciesPerRevolvingLine)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberOfOpenCreditLinesAndLoans))
		  .alias("LogDebtPerLine")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberRealEstateLoansOrLines))
		  .alias("LogDebtPerRealEstateLine")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberOfDependents))
		  .alias("LogDebtPerPerson")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.RevolvingLines /(1+sparkDFTrans.NumberOfDependents))
		  .alias("RevolvingLinesPerPerson")
	)	
	return sparkDFTrans
Example #11
def createTrans02(sparkDF):
	# ===========================
	# douglas fletcher
	# purpose: create data 
	# transformations (10 at a time) 
	# input: 
	# 	sparkDF type sparkDF
	# output: 
	# 	sparkDFTrans type sparkDF
	# ===========================
	sparkDFTrans = sparkDF
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines == 0, 1)
		  .otherwise(0)
		  .alias("ZeroRevolvingUtilization")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("RevolvingUtilizationOfUnsecuredLines"
		, when(log10(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines) > 3, 0)
		  .otherwise(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.MonthlyIncome <= 1, 1)
		  .otherwise(sparkDFTrans.MonthlyIncome)
		  .alias("LogDebt")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogDebt"
		, log10(sparkDFTrans.LogDebt * sparkDFTrans.DebtRatio)
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogDebt"
		, when(sparkDFTrans.LogDebt.isNull(), 0)
		  .otherwise(sparkDFTrans.LogDebt)
	)	
	sparkDFTrans = sparkDFTrans.withColumn("RevolvingLines"
		, sparkDFTrans.NumberOfOpenCreditLinesAndLoans - sparkDFTrans.NumberRealEstateLoansOrLines
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.RevolvingLines > 0, 1)
		  .otherwise(0)
		  .alias("HasRevolvingLines")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.NumberRealEstateLoansOrLines > 0, 1)
		  .otherwise(0)
		  .alias("HasRealEstateLoans")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.NumberRealEstateLoansOrLines > 2, 1)
		  .otherwise(0)
		  .alias("HasMultipleRealEstateLoans")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.age > 60, 1)
		  .otherwise(0)
		  .alias("EligibleSS")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when((sparkDFTrans.NoIncome == 0) & (sparkDFTrans.DebtRatio > 0.33), 1)
		  .otherwise(0)
		  .alias("DTIOver33")
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, when((sparkDFTrans.NoIncome == 0) & (sparkDFTrans.DebtRatio > 0.43), 1)
		  .otherwise(0)
		  .alias("DTIOver43")
	)	
	return sparkDFTrans
Example #12
def parse_genebass_evidence(genebass_df: DataFrame) -> DataFrame:
    """
    Parse Genebass's disease/target evidence.
    Args:
        genebass_df: DataFrame with Genebass's portal data
    Returns:
        evd_df: DataFrame with Genebass's data following the t/d evidence schema.
    """
    to_keep = [
        'datasourceId',
        'datatypeId',
        'targetFromSourceId',
        'diseaseFromSource',
        'diseaseFromSourceId',
        'diseaseFromSourceMappedId',
        'pValueMantissa',
        'pValueExponent',
        'beta',
        'betaConfidenceIntervalLower',
        'betaConfidenceIntervalUpper',
        'oddsRatio',
        'oddsRatioConfidenceIntervalLower',
        'oddsRatioConfidenceIntervalUpper',
        'resourceScore',
        'ancestry',
        'ancestryId',
        'projectId',
        'cohortId',
        'studySampleSize',
        'studyCases',
        'statisticalMethod',
        'statisticalMethodOverview',
    ]

    # WARNING: There are some associations with a p-value of 0.0 in Genebass.
    # This is a bug we still have to elucidate; it might be due to a float overflow.
    # This evidence needs to be manually corrected so that it is not lost and passes validation.
    # As an interim solution, its p-value is set to the minimum in the evidence set.
    logging.warning(
        f"There are {genebass_df.filter(col('Pvalue_Burden') == 0.0).count()} evidence with a p-value of 0.0."
    )
    minimum_pvalue = (
        genebass_df.filter(col('Pvalue_Burden') > 0.0).agg({'Pvalue_Burden': 'min'}).collect()[0]['min(Pvalue_Burden)']
    )
    genebass_df = genebass_df.withColumn(
        'Pvalue_Burden', when(col('Pvalue_Burden') == 0.0, lit(minimum_pvalue)).otherwise(col('Pvalue_Burden'))
    )

    return (
        genebass_df.withColumn('datasourceId', lit('gene_burden'))
        .withColumn('datatypeId', lit('genetic_association'))
        .withColumn('projectId', lit('Genebass'))
        .withColumn('cohortId', lit('UK Biobank 450k'))
        .withColumn('ancestry', lit('EUR'))
        .withColumn('ancestryId', lit('HANCESTRO_0009'))
        .withColumnRenamed('gene_id', 'targetFromSourceId')
        .withColumnRenamed('description', 'diseaseFromSource')
        .withColumnRenamed('phenocode', 'diseaseFromSourceId')
        .join(
            import_trait_mappings(),
            on='diseaseFromSource',
            how='left',
        )
        .withColumnRenamed('Pvalue_Burden', 'resourceScore')
        .withColumn('pValueExponent', log10(col('resourceScore')).cast(IntegerType()) - lit(1))
        .withColumn('pValueMantissa', round(col('resourceScore') / pow(lit(10), col('pValueExponent')), 3))
        # Stats are split taking into consideration the type of the trait
        # Those that are not continuous or categorical were reviewed and all of them are considered as categorical
        .withColumn(
            'beta',
            when(col('trait_type') == 'continuous', col('BETA_Burden')),
        )
        .withColumn(
            'betaConfidenceIntervalLower',
            when(col('trait_type') == 'continuous', col('BETA_Burden') - col('SE_Burden')),
        )
        .withColumn(
            'betaConfidenceIntervalUpper',
            when(col('trait_type') == 'continuous', col('BETA_Burden') + col('SE_Burden')),
        )
        .withColumn(
            'oddsRatio',
            when(col('trait_type').isin(['categorical', 'icd_first_occurrence', 'icd10']), col('BETA_Burden')),
        )
        .withColumn(
            'oddsRatioConfidenceIntervalLower',
            when(
                col('trait_type').isin(['categorical', 'icd_first_occurrence', 'icd10']),
                col('BETA_Burden') - col('SE_Burden'),
            ),
        )
        .withColumn(
            'oddsRatioConfidenceIntervalUpper',
            when(
                col('trait_type').isin(['categorical', 'icd_first_occurrence', 'icd10']),
                col('BETA_Burden') + col('SE_Burden'),
            ),
        )
        .withColumn('studySampleSize', (col('n_cases') + coalesce('n_controls', lit(0))))
        .withColumnRenamed('n_cases', 'studyCases')
        .withColumnRenamed('annotation', 'statisticalMethod')
        .withColumn('statisticalMethodOverview', col('statisticalMethod'))
        .replace(to_replace=METHOD_DESC, subset=['statisticalMethodOverview'])
        .select(to_keep)
        .distinct()
    )
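
A quick sanity check of the mantissa/exponent split used above (plain-Python arithmetic; the value 3.2e-8 is illustrative only):

# Illustrative check of the pValueMantissa / pValueExponent derivation above.
# Spark's cast to IntegerType truncates toward zero, as does Python's int().
import math

p = 3.2e-8
exponent = int(math.log10(p)) - 1        # log10(p) ≈ -7.495 -> -7 -> exponent = -8
mantissa = round(p / 10 ** exponent, 3)  # 3.2e-8 / 1e-8 = 3.2
print(mantissa, exponent)                # 3.2 -8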
Example #13
print(data_reduce.select("GDP").rdd.max()[0])
print(data_reduce.select("GDP").rdd.min()[0])

# In[14]:

import math
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType
from pyspark.sql import functions as F

print(data_reduce.head())
for i in range(0, 17):
    if (i == 1):
        continue
    data_reduce = data_reduce.withColumn(
        data_reduce.columns[i], F.log10(col(data_reduce.columns[i]) + 1))

print(data_reduce.head())

# In[15]:

x = [i for i in range(0, data_reduce.count())]
get_ipython().run_line_magic('matplotlib', 'notebook')
for i in range(0, 17):
    plt.subplot(6, 3, i + 1)
    y = data_reduce.select(data_reduce.columns[i]).collect()
    plt.scatter(x, y, color='green', marker='o', edgecolor='black', alpha=0.5)
    plt.title(data_reduce.columns[i])
plt.show()

# # 6.3
Example #14
def createTrans01(sparkDF):
	# ===========================
	# douglas fletcher
	# purpose: create data 
	# transformations (10 at a time) 
	# input: 
	# 	sparkDF type sparkDF
	# output: 
	# 	sparkDFTrans type sparkDF
	# ===========================
	sparkDFTrans = sparkDF
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.NumberOfDependents.isNull(), 1)
		  .otherwise(0)
		  .alias("UnknownNumberOfDependents")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.MonthlyIncome.isNull(), 1)
		  .otherwise(0)
		  .alias("UnknownMonthlyIncome")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.NumberOfDependents == 0, 1)
		  .otherwise(0)
		  .alias("NoDependents")
	)
	sparkDFTrans = sparkDFTrans.withColumn("NumberOfDependents"
		, when(sparkDFTrans.UnknownNumberOfDependents == 1, 0)
		  .otherwise(sparkDFTrans.NumberOfDependents)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.MonthlyIncome == 0, 1)
		  .otherwise(0)
		  .alias("NoIncome")
	)
	sparkDFTrans = sparkDFTrans.withColumn("NoIncome"
		, when(sparkDFTrans.NoIncome.isNull(), 0).otherwise(sparkDFTrans.NoIncome)
	)
	sparkDFTrans = sparkDFTrans.withColumn("MonthlyIncome"
		, when(sparkDFTrans.UnknownMonthlyIncome == 1, 0)
		  .otherwise(sparkDFTrans.MonthlyIncome)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.DebtRatio == 0, 1)
		  .otherwise(0)
		  .alias("ZeroDebtRatio")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.DebtRatio == 0, 0)
		  .otherwise(sparkDFTrans.DebtRatio)
		  .alias("UnknownIncomeDebtRatio")
	)
	sparkDFTrans = sparkDFTrans.withColumn("DebtRatio"
		, when(sparkDFTrans.UnknownMonthlyIncome == 1, 0)
		  .otherwise(sparkDFTrans.DebtRatio)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(log10(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines) > 3
			, sparkDFTrans.RevolvingUtilizationOfUnsecuredLines)
		.otherwise(0)
		.alias("WeirdRevolvingUtilization")
	)	
	return sparkDFTrans
Example #15
def main():

    # Args
    args = parse_args()
    # args.in_ld_folder = 'input_data/ld_each_variant'
    # args.in_manifest = 'input_data/190625/ld_analysis_input.tsv'
    # args.in_top_loci = 'input_data/190625/toploci.parquet'
    # args.out = 'output/ld_w_crediblesets.parquet'
    # args.min_r2 = 0.5

    # Make spark session
    global spark
    spark = (pyspark.sql.SparkSession.builder.config("spark.master",
                                                     "local[*]").getOrCreate())
    print('Spark version: ', spark.version)

    #
    # Load data ---------------------------------------------------------------
    #

    # Load LD
    ld = (
        load_ld(args.in_ld_folder).withColumn(
            'index_variant_id',
            regexp_replace(col('index_variant_id'), ':', '_')).withColumn(
                'tag_variant_id',
                regexp_replace(col('tag_variant_id'), ':', '_'))
        # .limit(10000) # Debug
    )

    # Load manifest
    manifest = (load_manifest(args.in_manifest).withColumnRenamed(
        'variant_id', 'index_variant_id'))

    #
    # Weight correlations by study population ---------------------------------
    #

    # Join LD to manifest
    data = manifest.join(ld, on='index_variant_id')

    # Replace R fields
    for coln in ['R_AFR', 'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS']:
        data = (
            data
            # Replace all R values == 1 with 0.9999995, otherwise we get error
            # This is reverted later by rounding to 6 dp
            .withColumn(coln,
                when(col(coln) == 1, 0.9999995).otherwise(col(coln))
            )
            # Fill nulls with 0
            .withColumn(coln,
                when(col(coln).isNull(), 0).otherwise(col(coln))
            )
        )

    # Fisher transform correlations to z-scores
    for coln in ['R_AFR', 'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS']:
        data = data.withColumn(coln.replace('R_', 'Z_'), arctanh(col(coln)))

    # Compute weighted average across populations
    data = data.withColumn(
        'Z_overall',
        ((col('AFR_prop') * col('Z_AFR')) + (col('AMR_prop') * col('Z_AMR')) +
         (col('EAS_prop') * col('Z_EAS')) + (col('EUR_prop') * col('Z_EUR')) +
         (col('SAS_prop') * col('Z_SAS'))))

    # Inverse Fisher transform weighted z-scores back to correlations
    data = data.withColumn('R_overall', tanh(col('Z_overall')))

    # Round R_overall to 6 dp
    data = data.withColumn('R_overall', round6dp(col('R_overall')))

    # Convert R to R2
    data = data.withColumn('R2_overall', pow(col('R_overall'), 2))

    # Drop rows where R2 is null
    data = data.filter(col('R2_overall').isNotNull())

    # Filter based on overall R2
    data = data.filter(col('R2_overall') >= args.min_r2)

    # Drop unneeded columns
    data = data.drop(*[
        'Z_overall', 'R_overall', 'R_AFR', 'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS',
        'Z_AFR', 'Z_AMR', 'Z_EAS', 'Z_EUR', 'Z_SAS', 'index_variant_id'
    ])

    # Denormalise variant IDs
    data = (data.withColumnRenamed('chrom', 'lead_chrom').withColumnRenamed(
        'pos', 'lead_pos').withColumnRenamed(
            'ref', 'lead_ref').withColumnRenamed('alt', 'lead_alt').withColumn(
                'tag_split', split(col('tag_variant_id'), '_')).withColumn(
                    'tag_chrom',
                    col('tag_split').getItem(0)).withColumn(
                        'tag_pos',
                        col('tag_split').getItem(1).cast('int')).withColumn(
                            'tag_ref',
                            col('tag_split').getItem(2)).withColumn(
                                'tag_alt',
                                col('tag_split').getItem(3)).drop(
                                    'tag_split', 'tag_variant_id'))

    #
    # Conduct credible set analysis using PICS adjustment ---------------------
    #
    ''' Probabilistic Identification of Causal SNPs (PICS) from Farh (2014):
            https://www.nature.com/articles/nature13835

        Adjusts the p-values for tag SNPs based on the p-value of the lead SNP
        and its LD.
    '''

    # Empiric constant that can be adjusted to fit the curve, 6.4 recommended.
    k = 6.4

    # Load toploci
    toploci = spark.read.parquet(args.in_top_loci)

    # Join negative log pvalue from toploci onto data
    toploci = (toploci.withColumn(
        'neglog_p',
        -1 * (log10(col('pval_mantissa')) +
              col('pval_exponent'))).withColumnRenamed(
                  'chrom', 'lead_chrom').withColumnRenamed(
                      'pos', 'lead_pos').withColumnRenamed(
                          'ref', 'lead_ref').withColumnRenamed(
                              'alt',
                              'lead_alt').select('study_id', 'lead_chrom',
                                                 'lead_pos', 'lead_ref',
                                                 'lead_alt', 'neglog_p'))
    data = data.join(
        toploci,
        on=['study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt'])

    # Calculate PICS statistics
    data = (data.withColumn('pics_mu',
                            col('R2_overall') * col('neglog_p')).withColumn(
                                'pics_std',
                                sqrt(1 - sqrt(col('R2_overall'))**k) *
                                sqrt(col('neglog_p')) / 2).withColumn(
                                    'pics_relative_prob',
                                    when(col('pics_std') == 0,
                                         1.0).otherwise(
                                             norm_sf(col('pics_mu'),
                                                     col('pics_std'),
                                                     col('neglog_p')))))

    # Calculate the sum of the posterior probabilities at each locus
    pics_prob_sums = (data.groupby(
        'study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt').agg(
            sum('pics_relative_prob').alias('pics_relative_prob_sum')))

    # Merge back onto data
    data = data.join(
        pics_prob_sums,
        on=['study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt'])

    # Calculate posterior probability at each locus
    data = (data.withColumn(
        'pics_postprob',
        col('pics_relative_prob') / col('pics_relative_prob_sum')).drop(
            'pics_relative_prob_sum', 'neglog_p'))

    # Calculate cumulative sum per locus
    window_spec = (Window.partitionBy('study_id', 'lead_chrom', 'lead_pos',
                                      'lead_ref', 'lead_alt').orderBy(
                                          desc('pics_postprob')).rowsBetween(
                                              Window.unboundedPreceding,
                                              Window.currentRow))
    data = (data.withColumn('pics_postprob_cumsum',
                            sum('pics_postprob').over(window_spec)))

    # Label whether each row is in the 95 and 99% credible sets
    window_spec = (Window.partitionBy(
        'study_id', 'lead_chrom', 'lead_pos', 'lead_ref',
        'lead_alt').orderBy('pics_postprob_cumsum'))
    data = (data.withColumn(
        'pics_95perc_credset',
        when(lag('pics_postprob_cumsum', 1).over(window_spec) >= 0.95,
             False).otherwise(True)).withColumn(
                 'pics_99perc_credset',
                 when(
                     lag('pics_postprob_cumsum', 1).over(window_spec) >= 0.99,
                     False).otherwise(True)))

    #
    # Write output ------------------------------------------------------------
    #

    # Rename columns and format
    data = (data.withColumnRenamed(
        'AFR_prop', 'AFR_1000G_prop').withColumnRenamed(
            'AMR_prop', 'AMR_1000G_prop').withColumnRenamed(
                'EAS_prop', 'EAS_1000G_prop').withColumnRenamed(
                    'EUR_prop', 'EUR_1000G_prop').withColumnRenamed(
                        'SAS_prop', 'SAS_1000G_prop').withColumnRenamed(
                            'R2_overall', 'overall_r2').select(
                                'study_id', 'lead_chrom', 'lead_pos',
                                'lead_ref', 'lead_alt', 'tag_chrom', 'tag_pos',
                                'tag_ref', 'tag_alt', 'overall_r2', 'pics_mu',
                                'pics_postprob', 'pics_95perc_credset',
                                'pics_99perc_credset', 'AFR_1000G_prop',
                                'AMR_1000G_prop', 'EAS_1000G_prop',
                                'EUR_1000G_prop', 'SAS_1000G_prop'))

    # Save output
    (data.repartitionByRange('study_id', 'lead_chrom',
                             'lead_pos').write.parquet(args.out,
                                                       mode='overwrite'))

    return 0
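
Several helpers used in `main` (e.g. `norm_sf`, `round6dp`, `arctanh`) are defined elsewhere in the project. As a sketch only, one plausible definition of `norm_sf`, assuming it is the survival function of a normal distribution with mean `pics_mu` and standard deviation `pics_std` evaluated at `neglog_p`:

# Plausible (assumed) definition of the norm_sf helper used in the PICS step.
import pandas as pd
from scipy.stats import norm
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import DoubleType

@pandas_udf(DoubleType())
def norm_sf(mu: pd.Series, std: pd.Series, value: pd.Series) -> pd.Series:
    # Survival function P[X >= value] for X ~ N(mu, std); the surrounding
    # code already handles the std == 0 case via when(...).
    return pd.Series(norm.sf(value, loc=mu, scale=std))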
Example #16
                   sf.lit(h3_resolution)))

    # group by the h3 grid
    .groupBy("h3")

    # grab counts
    .count()

    # add the centroid
    .withColumn("h3_centroid", h3_hex_centroid(sf.col("h3")))

    # rename columns
    .select("h3", "count",
            sf.col("h3_centroid.lat").alias("lat"),
            sf.col("h3_centroid.lon").alias("lon"),
            sf.log10("count").alias("log_count"))

    # ensure sparse representation
    .filter(sf.col("count") > sf.lit(0)))
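
The `h3_hex_centroid` helper is not shown in this snippet; a plausible definition (an assumption, written against the h3-py v3 API) that matches the `h3_centroid.lat` / `h3_centroid.lon` references above:

# Hypothetical h3_hex_centroid UDF (an assumption, not the original helper).
import h3
from pyspark.sql import functions as sf
from pyspark.sql.types import StructType, StructField, DoubleType

centroid_schema = StructType([
    StructField("lat", DoubleType()),
    StructField("lon", DoubleType()),
])

@sf.udf(returnType=centroid_schema)
def h3_hex_centroid(cell):
    lat, lon = h3.h3_to_geo(cell)  # v3 API returns (lat, lng) for a cell index
    return (lat, lon)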

# COMMAND ----------

# MAGIC %md __Convert results to Pandas__

# COMMAND ----------

pandas_df = h3_aggregation.toPandas()

# COMMAND ----------

# MAGIC %md __Export results to CSV__
Example #17
def mysolution(sc,spark):
	# Load data/DataSample.csv to Spark DataFrame

	df_dataSample = spark.read.option("header", True).csv("data/DataSample.csv")
	print('\nDisplay Schema of DataSample.csv dataset table\n')
	df_dataSample.printSchema()


	# In[9]:


	#Display the contents of DataSample data
	print('\nDisplay contents of DataSample.csv dataset table\n')
	df_dataSample.show()


	# ### 1. Cleanup
	# 
	# A sample dataset of request logs is given in data/DataSample.csv. We consider records that have identical geoinfo and timest as suspicious. Please clean up the sample dataset by filtering out those suspicious request records.

	# In[22]:


	# Drop duplicate rows based on columns TimeSt, Latitude and Longitude

	df_clean = df_dataSample.dropDuplicates([' TimeSt', 'Latitude', 'Longitude'])
	print ('\nDisplay clean dataset after dropping suspicious requests (i.e., duplicate geoinfo and timest)\n')
	df_clean.show()

	print ("\n------------------------END OF ANSWER #1------------------------\n")
	# **End of Answer #1**
	# 
	# ---

	# ### 2. Label
	# Assign each request (from data/DataSample.csv) to the closest (i.e. minimum distance) POI (from data/POIList.csv).
	# 
	# **Note:** A POI is a geographical Point of Interest.

	# In[23]:


	#Load data from data/POIList.csv in Spark Dataframe

	df_poil = spark.read.option("header", True).csv("data/POIList.csv")
	
	print ('\nDisplay Schema and data of POIList dataset table\n')
	df_poil.printSchema()
	df_poil.show(5)


	# In[24]:


	#Convert pois Spark DataFrame to Pandas Dataframe
	df_pd_pois = df_poil.toPandas()


	# In[69]:


	#Python-UDF to find POI with minimum distance to each entry of DataSample
	def myfun(la2, lo2):
		
		min_dis = 1.0e10
		poi_id = df_pd_pois.loc[0,'POIID']
		
		for i, (la1,lo1) in enumerate( zip(df_pd_pois[' Latitude'], df_pd_pois['Longitude'])):
			la1, lo1 = float(la1), float(lo1)
			dis = math.sqrt((la1-la2)**2 + (lo1-lo2)**2)
			if min_dis > dis:
				min_dis = dis
				poi_id = df_pd_pois.loc[i,'POIID']
				
		return ([poi_id, min_dis])

	#Register Python-UDF with Spark-UDF
	myfun_spark = F.udf(myfun, ArrayType(StringType()))

	df_poi = df_clean.withColumn('temp_col', myfun_spark(  F.col('Latitude').cast(FloatType()),
											  F.col('Longitude').cast(FloatType())  )).cache()\
				.withColumn('POI', F.col('temp_col')[0])\
				.withColumn('POI_DIS', F.col('temp_col')[1].cast(DoubleType()))\
				.drop('temp_col')

	print('Display the dataframe with the new columns of nearest POI and POI_DIS (i.e., distance from the request to its POI)')
	df_poi.show(5)

	print ("\n------------------------END OF ANSWER #2------------------------\n")
	# **End of Answer #2**
	# 
	# ---

	# ### 3. Analysis
	# For each POI, calculate the average and standard deviation of the distance between the POI to each of its assigned requests.
	# 
	# At each POI, draw a circle (with the center at the POI) that includes all of its assigned requests. Calculate the radius and density (requests/area) for each POI.

	# In[70]:


	#Group the dataframe df_poi on 'POI' column and calculate average and standard deviation on each group
	df_avgSD = df_poi.groupby('POI').agg(F.avg('POI_DIS').alias('Average'), F.stddev('POI_DIS').alias('Std_Dev'))

	#Left Join df_avgSD dataframe to df_poil dataframe for completeness
	df_avgSD = df_poil.join(df_avgSD, df_poil.POIID == df_avgSD.POI, how = 'Left').drop(df_avgSD.POI)

	print('Display distance Average and Std_Dev for each POI')
	df_avgSD.show()

	print ("Note: Based on above output, it can be concluded that POI2 radius of influence is ZERO\n")
	# **Note:** Based on above output, it can be concluded that POI2 radius of influence is ZERO

	# In[71]:


	#The radius of the influence circle of each POI is the distance to its farthest assigned request

	w = Window.partitionBy('POI')

	df_radius = df_poi.withColumn('max_r', F.max('POI_DIS').over(w))\
				.where(F.col('POI_DIS') == F.col('max_r'))\
				.drop('max_r')

	#Left Join df_radius dataframe to df_poil dataframe for completeness
	df_avgSD_r = df_avgSD.join(df_radius['POI', 'POI_DIS'], df_avgSD.POIID == df_radius.POI, how = 'Left')\
				.drop(df_radius.POI)\
				.withColumnRenamed('POI_DIS', 'POI_RADIUS')

	print('Display the maximum POI_DIS (i.e., POI_RADIUS) value for each group\n')
	df_avgSD_r.show()


	# In[72]:


	#Calculate number of requests for each POI
	df_no_of_req = df_poi.groupby('POI').agg(F.count('POI').alias('Requests'))

	#Append POI_No.
	df_poi_req = df_avgSD_r.join(df_no_of_req, df_avgSD_r.POIID == df_no_of_req.POI, 'Left')\
						.drop(df_no_of_req['POI'])

	#Calculate the density
	df_poi_density = df_poi_req.withColumn('Density', F.col('Requests')/ (3.14*F.col('POI_RADIUS')**2 ))

	print('Display No. of Requests and Density for each POI')
	df_poi_density.show()

	print ("\n------------------------END OF ANSWER #3------------------------\n")
	# **End of Answer #3**
	# 
	# ---

	# ### 4. Data Science/Engineering Tracks
	# Please complete either 4a or 4b. Extra points will be awarded for completing both tasks.
	# 
	# #### 4a. Model
	# To visualize the popularity of each POI, they need to be mapped to a scale that ranges from -10 to 10. Please provide a mathematical model to implement this, taking into consideration of extreme cases and outliers. Aim to be more sensitive around the average and provide as much visual differentiability as possible.
	# Bonus: Try to come up with some reasonable hypotheses regarding POIs, state all assumptions, testing steps and conclusions. Include this as a text file (with a name bonus) in your final submission.

	# In[61]:


	#Import PySpark Libraries for Data Analytics
	from pyspark.ml.feature import MinMaxScaler
	from pyspark.ml.feature import VectorAssembler
	from pyspark.ml import Pipeline


	# In[88]:


	df_poi_density_temp = df_poi_density.filter(df_poi_density.Density.isNotNull())
	#df_poi_density_temp.show()


	# In[109]:


	# Spark-udf for converting column from vector type to double type
	myfun_vec2double = F.udf(lambda x: round(float(list(x)[0]),3), DoubleType())

	# Use Spark VectorAssembler Transformation - Converting column to vector type
	assembler = VectorAssembler(inputCols=['Density'],outputCol="Density_Vector")

	# Use Spark MinMaxScaler Transformation to scale the column within (min,max) range
	scaler = MinMaxScaler(min = -10, max = 10, inputCol="Density_Vector", outputCol="Density_Scaled")

	# Create a Spark Pipeline of VectorAssembler and MinMaxScaler
	pipeline = Pipeline(stages=[assembler, scaler])

	#Drop POI2 as outlier 
	df_poi_density_temp = df_poi_density.filter(df_poi_density.Density.isNotNull())

	# Spark fitting pipeline on dataframe
	df_norm = pipeline.fit(df_poi_density_temp).transform(df_poi_density_temp).withColumn("Density_Scaled", myfun_vec2double("Density_Scaled")).drop("Density_Vector")

	print('Display scaled density for each POI')
	df_norm.select(*['POIID'], *[F.round(c, 3).alias(c) for c in df_norm.columns[1:] ]).show()


	# In[112]:


	df_lognorm = df_norm.withColumn('log_Density', F.log10(F.col('Density')) )

	# Use Spark VectorAssembler Transformation - Converting column to vector type
	assembler_log = VectorAssembler(inputCols=['log_Density'],outputCol="log_Density_Vector")

	# Use Spark MinMaxScaler Transformation to scale the column within (min,max) range
	scaler_log = MinMaxScaler(min = -1.0, max = 1.0, inputCol="log_Density_Vector", outputCol="log_Density_Scaled")

	# Create a Spark Pipeline of VectorAssembler and MinMaxScaler
	pipeline_log = Pipeline(stages=[assembler_log, scaler_log])


	# Spark fitting pipeline on dataframe
	df_lognorm = pipeline_log.fit(df_lognorm).transform(df_lognorm)\
				.withColumn("log_Density_Scaled", myfun_vec2double("log_Density_Scaled"))\
				.drop("log_Density_Vector")

	print('Display scaled log_density for each POI')
	df_lognorm.select(*['POIID'], *[F.round(c, 3).alias(c) for c in df_lognorm.columns[1:] ]).show()


	#Save the interpretation on results in 'bonus' file
	bonus = """
	Interpretation:
	Density column is the ratio of Requests to POI_Area. log_Density was calculated by taking log10 of the Density values. log_Density was then scaled to the range (-10,10) to calculate log_Density_Scaled.

	It is difficult to come up with statistics with only 3 good POIs.

	Nonetheless, the density values of POI1 and POI3 are 3 orders of magnitude higher than POI4's. Hence, the Density_Scaled, log_Density and log_Density_Scaled values are also skewed.
	POI1 and POI3 attract more customers or requests per unit area of influence.

	Assumptions: POI2 was dropped as an outlier. POI2 data must be investigated to identify the cause of its zero zone of influence. Bad data collection and formatting may explain why POI2 is an outlier.
	"""

	with open('bonus', 'w') as f:
		f.write(bonus)


	# 
	# **Interpretation:**
	# Density column is the ratio of Requests to POI_Area. log_Density was calculated by taking log10 of the Density values. log_Density was then scaled to the range (-10,10) to calculate log_Density_Scaled.
	# 
	# It is difficult to come up with statistics with only 3 good POIs.
	# 
	# Nonetheless, the density values of POI1 and POI3 are 3 orders of magnitude higher than POI4's. Hence, the Density_Scaled, log_Density and log_Density_Scaled values are also skewed.
	# POI1 and POI3 attract more customers or requests per unit area of influence.
	# 
	# **Assumptions:** POI2 was dropped as an outlier. POI2 data must be investigated to identify the cause of its zero zone of influence. Bad data collection and formatting may explain why POI2 is an outlier.

	print ("\n------------------------END OF ANSWER #4a------------------------\n")
	# **End of Answer #4a**
	# 
	# ----

	# #### 4b. Pipeline Dependency
	# We use a modular design on all of our data analysis tasks. To get to a final product, we organize steps using a data pipeline. One task may require the output of one or multiple other tasks to run successfully. This creates dependencies between tasks.
	# 
	# We also require the pipeline to be flexible. This means a new task may enter a running pipeline anytime that may not have the tasks' dependencies satisfied. In this event, we may have a set of tasks already running or completed in the pipeline, and we will need to map out which tasks are prerequisites for the newest task so the pipeline can execute them in the correct order. For optimal pipeline execution, when we map out the necessary tasks required to execute the new task, we want to avoid scheduling tasks that have already been executed.
	# 
	# If we treat each task as a node and the dependencies between a pair of tasks as directed edges, we can construct a DAG (Wiki: Directed Acyclic Graph).
	# 
	# Consider the following scenario. At a certain stage of our data processing, we have a set of tasks (starting tasks) that we know all its prerequisite task has been executed, and we wish to reach to a later goal task. We need to map out a path that indicates the order of executions on tasks that finally leads to the goal task. We are looking for a solution that satisfies both necessity and sufficiency -- if a task is not a prerequisite task of goal, or its task is a prerequisite task for starting tasks (already been executed), then it shouldn't be included in the path. The path needs to follow a correct topological ordering of the DAG, hence a task needs to be placed behind all its necessary prerequisite tasks in the path.
	# 
	# Note: A starting task should be included in the path, if and only if it's a prerequisite of the goal task
	# 
	# For example, we have 6 tasks [A, B, C, D, E, F], C depends on A (denoted as A->C), B->C, C->E, E->F. A new job has at least 2 tasks and at most 6 tasks, each task can only appear once.
	# 
	# Examples:
	# 
	# Inputs: starting task: A, goal task: F, output: A,B,C,E,F or B,A,C,E,F.
	# Input: starting task: A,C, goal task:'F', outputs: C,E,F.
	# You will find the starting task and the goal task in question.txt file, list of all tasks in task_ids.txt and dependencies in relations.txt.
	# 
	# Please submit your implementation and result.

	# In[113]:


	#Assign questions data
	questions = {'starting task': '73', 'goal task': '36'}
	questions


	# In[114]:


	#Assign relations data
	relations = [(97,102),
				 (75,31),
				 (75,37),
				 (100,20),
				 (102,36),
				 (102,37),
				 (102,31),
				 (16,37),
				 (39,73),
				 (39,100),
				 (41,73),
				 (41,112),
				 (62,55),
				 (112,97),
				 (20,94),
				 (20,97),
				 (21,20),
				 (73,20),
				 (56,102),
				 (56,75),
				 (56,55),
				 (55,31),
				 (55,37),
				 (94,56),
				 (94,102)]


	# In[117]:


	#Assign Task-IDs data
	task_ids = [97,75,100,102,16,39,41,62,112,20,21,73,56,55,36,37,94,31]


	# In[118]:


	#Create a pandas-dataframe of relations data
	r_pd = pd.DataFrame(relations, columns = ['from', 'to'])
	#r_pd.head()


	# In[119]:


	#Get starting target (st) and goal target (gt)
	st = int(questions['starting task']); print ('Starting Task: %2d'%(st))
	gt = int(questions['goal task']); print ('Goal Task: %2d'%(gt))


	# In[171]:


	#A python recursive function to find the path from source to target
	def replicate_recur(st, gt, mylist=None):

		# If a list has not been passed as argument create an empty one
		if mylist is None:
			mylist = [st]
			
		if st == gt:
			return mylist
		
		temp = r_pd[r_pd['from'] == st].values

		if not temp.any() :
			temp = 'Error'
			mylist.append(temp)
			return mylist
		
		mylist = [ [i for i in mylist] for _ in range(len(temp))]
		for idx,val in enumerate(temp[:,1]):
			mylist[idx].append(val)
			mylist[idx] = replicate_recur(val, gt, mylist[idx])

		return mylist

	output = []
	def removeNestings(l): 
		for i in l: 
			if (type(i) == list) & (type(i[0]) == list):
				removeNestings(i) 
			elif ('Error' not in i):
				output.append(i)

	print ('\nThe different paths from Starting Target to Goal Target\n')
	removeNestings([replicate_recur(st, gt)])
	pprint(output)

	print ("\n------------------------END OF ANSWER #4b------------------------\n")
Example #18
# MAGIC %md
# MAGIC
# MAGIC #### Run Quality Control
# MAGIC
# MAGIC Perform variant-wise filtering on Hardy-Weinberg equilibrium P-values and allele frequency

# COMMAND ----------

hwe = (spark.read.format("delta").load(delta_silver_path).where(
    (fx.col("alleleFrequencies").getItem(0) >= allele_freq_cutoff)
    & (fx.col("alleleFrequencies").getItem(0) <=
       (1.0 - allele_freq_cutoff))).withColumn(
           "log10pValueHwe",
           fx.when(fx.col("pValueHwe") == 0,
                   26).otherwise(-fx.log10(fx.col("pValueHwe")))))

# COMMAND ----------

hwe_cutoff = calculate_pval_bonferroni_cutoff(hwe)
mlflow.log_param("Hardy-Weinberg P value cutoff", hwe_cutoff)

# COMMAND ----------

display(
    plot_histogram(df=hwe.select("log10pValueHwe"),
                   col="log10pValueHwe",
                   xlabel='-log_{10}(P)',
                   xmin=0,
                   xmax=25,
                   nbins=50,
Example #19
File: gwas.py  Project: pantonim11/glow
# COMMAND ----------

# MAGIC %md
# MAGIC 
# MAGIC #### Run Quality Control
# MAGIC 
# MAGIC Perform variant-wise filtering on Hardy-Weinberg equilibrium P-values and allele frequency

# COMMAND ----------

hwe = (spark.read.format("delta")
                 .load(delta_silver_path)
                 .where((fx.col("alleleFrequencies").getItem(0) >= allele_freq_cutoff) & 
                       (fx.col("alleleFrequencies").getItem(0) <= (1.0 - allele_freq_cutoff)))
                 .withColumn("log10pValueHwe", fx.when(fx.col("pValueHwe") == 0, 26).otherwise(-fx.log10(fx.col("pValueHwe")))))

# COMMAND ----------

hwe_cutoff = calculate_pval_bonferroni_cutoff(hwe)
mlflow.log_param("Hardy-Weinberg P value cutoff", hwe_cutoff)

# COMMAND ----------

display(plot_histogram(df=hwe.select("log10pValueHwe"), 
                       col="log10pValueHwe",
                       xlabel='-log_{10}(P)',
                       xmin=0, 
                       xmax=25, 
                       nbins=50, 
                       plot_title="hardy-weinberg equilibrium", 
Example #20
def createTrans08(sparkDF):
	# ===========================
	# douglas fletcher
	# purpose: create data 
	# transformations (10 at a time) 
	# input: 
	# 	sparkDF type sparkDF
	# output: 
	# 	sparkDFTrans type sparkDF
	# ===========================
	sparkDFTrans = sparkDF
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogNumberOfTimes90DaysLate - sparkDFTrans.LogNumberOfTime6089DaysPastDueNotWorse)
		  .alias("LogRatio90to6089DaysLate")
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.NumberOfOpenCreditLinesAndLoans > 0, 1).otherwise(0) 
		  .alias("AnyOpenCreditLinesOrLoans")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (log10(sparkDFTrans.NumberOfOpenCreditLinesAndLoans))
		  .alias("LogNumberOfOpenCreditLinesAndLoans")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogNumberOfOpenCreditLinesAndLoans"
		, when(sparkDFTrans.LogNumberOfOpenCreditLinesAndLoans.isNull(), 0)
		  .otherwise(sparkDFTrans.LogNumberOfOpenCreditLinesAndLoans)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogNumberOfOpenCreditLinesAndLoans - log1p(sparkDFTrans.NumberOfDependents))
		  .alias("LogNumberOfOpenCreditLinesAndLoansPerPerson")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.NumberOfDependents > 0, 1).otherwise(0) 
		  .alias("HasDependents")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, log1p(sparkDFTrans.NumberOfDependents) 
		  .alias("LogHouseholdSize")
	)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfDependents)
	sparkDFTrans = sparkDFTrans.select("*"
		, log10(sparkDFTrans.DebtRatio) 
		  .alias("LogDebtRatio")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogDebtRatio"
		, when(sparkDFTrans.LogDebtRatio.isNull(), 0)
		  .otherwise(sparkDFTrans.LogDebtRatio)
	)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.DebtRatio)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberOfTimesPastDue)) 
		  .alias("LogDebtPerDelinquency")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberOfTimes90DaysLate)) 
		  .alias("LogDebtPer90DaysLate")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (log10(sparkDFTrans.UnknownIncomeDebtRatio)) 
		  .alias("LogUnknownIncomeDebtRatio")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogUnknownIncomeDebtRatio"
		, when(sparkDFTrans.LogUnknownIncomeDebtRatio.isNull(), 0)
		  .otherwise(sparkDFTrans.LogUnknownIncomeDebtRatio)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogUnknownIncomeDebtRatio - sparkDFTrans.LogHouseholdSize) 
		  .alias("LogUnknownIncomeDebtRatioPerPerson")
	)
	return sparkDFTrans
Example #21
def getTrendRows(df, attrbs, targColumn, targVal, recordsNo, targVals_counts,
                 depth, parent_attrb):

    trends = []

    #print parent_attrb

    #print depth

    #Edit this for number of recursive levels
    if depth == 3:
        return

    for attrb in attrbs:

        df_t = df.groupBy(attrb).pivot(targColumn).count()
        df_t = df_t.select(attrb, str(targVal))

        df_t = df_t.withColumn('Support_' + str(targVal),
                               col(str(targVal)) / float(recordsNo))
        df_t = df_t.withColumn(
            'Confidence_' + str(targVal),
            col(str(targVal)) / float(targVals_counts[targVal]))
        df_t = df_t.withColumn(
            'Lift_' + str(targVal),
            col('Confidence_' + str(targVal)) /
            (targVals_counts[targVal] / float(recordsNo)))
        df_t = df_t.withColumn(
            'Metric_' + str(targVal),
            col('Support_' + str(targVal)) *
            log10(col('Lift_' + str(targVal))))
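        # Metric = Support * log10(Lift): weights each candidate rule by how common it is
        # and by how much the attribute value raises the target rate (log10(1) = 0, so a
        # lift below 1 scores negative)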

        scores = df_t.collect()
        scores = [[attrb] + list(x) for x in scores]
        trends = trends + scores

        #Adjust [:3] for top x rules
        trends = sorted(trends, key=lambda x: x[-1], reverse=True)[:3]

    for trend in trends:

        df_e = df.filter(col(trend[0]) == trend[1])
        df_e = df_e.drop(trend[0])
        tmp_attrbs = attrbs

        #print tmp_attrbs
        #print trend

        tmp_attrbs = [x for x in attrbs if x != trend[0]]

        parent_attrb.append(str(trend[0]) + ' == ' + str(trend[1]))

        #Getting row to be inserted in dataframe
        z = parent_attrb + trend[-5:]
        z[0:-5] = [' & '.join(z[0:-5])]

        global_trends.append(z)

        getTrendRows(df_e, tmp_attrbs, targColumn, targVal, recordsNo,
                     targVals_counts, depth + 1, parent_attrb)
        parent_attrb.pop()

    return global_trends
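A hedged usage sketch for the recursive trend miner above (column names are hypothetical; assumes `df` is a Spark DataFrame of categorical attributes plus a target column, and that `global_trends` is the module-level list the function appends to):

from pyspark.sql.functions import col, log10

global_trends = []
attrbs = ["country", "device", "channel"]      # hypothetical attribute columns
targColumn, targVal = "churned", 1             # hypothetical target column and value
recordsNo = df.count()
targVals_counts = {row[targColumn]: row["count"]
                   for row in df.groupBy(targColumn).count().collect()}

trends = getTrendRows(df, attrbs, targColumn, targVal, recordsNo,
                      targVals_counts, 0, [])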
示例#22
0
	df2 = df2.withColumn("text_entry", split("text_entry", " "))

	#Explode each text_entry value into multiple rows to get _id paired with each word of text_entry
	df2 = df2.withColumn("token", explode(col("text_entry")))

	#Calculating Term Frequency by grouping on '_id' and 'token' and counting how many times each token occurs in each document
	df_tf = df2.groupby("_id", "token").agg(F.count("text_entry").alias("tf"))

	#Calculating Document Frequency by grouping on each token and counting the number of documents it occurs in
	df_idf = df2.groupby("token").agg(F.countDistinct("_id").alias("df"))

	#Converting the 'df' column to DoubleType so the IDF division below yields a double
	df_idf = df_idf.withColumn("df", df_idf["df"].cast(DoubleType()))

	#Calculating IDF values
	df_idf = df_idf.withColumn("idf", F.log10(N/df_idf["df"]))

	#Joining df_tf and df_idf based on token columns
	tokensWithTfIdf = df_tf.join(df_idf, df_tf["token"] == df_idf["token"], how='left').drop(df_idf["token"])

	#Calculating TF-IDF Score
	tokensWithTfIdf = tokensWithTfIdf.withColumn("tf_idf", col("tf") * col("idf"))

	#Change ordering of columns in the Inverted Index
	tokensWithTfIdf = tokensWithTfIdf.select("token", "_id", "tf", "df", "idf", "tf_idf")
	print("\n")

	#Showing the top 20 rows of the Inverted Index
	tokensWithTfIdf.show()

	#Caching the Inverted Index for further usage
	tokensWithTfIdf.cache()
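A quick query sketch against the inverted index built above (assumes the block has run and tokensWithTfIdf is in scope; the search term is only an illustration):

	#Find the documents where a given token is most distinctive
	tokensWithTfIdf.filter(col("token") == "love").orderBy(col("tf_idf").desc()).show(10)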
示例#23
0
def parse_az_phewas_evidence(az_phewas_df: DataFrame) -> DataFrame:
    """
    Parse Astra Zeneca's PheWAS Portal evidence.
    Args:
        az_phewas_df: DataFrame with Astra Zeneca's PheWAS Portal data
    Returns:
        evd_df: DataFrame with Astra Zeneca's data following the t/d evidence schema.
    """
    to_keep = [
        'datasourceId',
        'datatypeId',
        'allelicRequirements',
        'targetFromSourceId',
        'diseaseFromSource',
        'diseaseFromSourceMappedId',
        'pValueMantissa',
        'pValueExponent',
        'beta',
        'betaConfidenceIntervalLower',
        'betaConfidenceIntervalUpper',
        'oddsRatio',
        'oddsRatioConfidenceIntervalLower',
        'oddsRatioConfidenceIntervalUpper',
        'resourceScore',
        'ancestry',
        'ancestryId',
        'literature',
        'projectId',
        'cohortId',
        'studySampleSize',
        'studyCases',
        'studyCasesWithQualifyingVariants',
        'statisticalMethod',
        'statisticalMethodOverview',
    ]

    return (
        az_phewas_df
        .withColumn('datasourceId', lit('gene_burden'))
        .withColumn('datatypeId', lit('genetic_association'))
        .withColumn('literature', array(lit('34375979')))
        .withColumn('projectId', lit('AstraZeneca PheWAS Portal'))
        .withColumn('cohortId', lit('UK Biobank 450k'))
        .withColumnRenamed('Gene', 'targetFromSourceId')
        .withColumnRenamed('Phenotype', 'diseaseFromSource')
        .join(import_trait_mappings(), on='diseaseFromSource', how='left')
        .withColumn('resourceScore', col('pValue'))
        .withColumn('pValueExponent',
                    log10(col('pValue')).cast(IntegerType()) - lit(1))
        .withColumn('pValueMantissa',
                    round(col('pValue') / pow(lit(10), col('pValueExponent')), 3))
        .withColumn('beta', when(col('Type') == 'Quantitative', col('beta')))
        .withColumn('betaConfidenceIntervalLower',
                    when(col('Type') == 'Quantitative', col('LCI')))
        .withColumn('betaConfidenceIntervalUpper',
                    when(col('Type') == 'Quantitative', col('UCI')))
        .withColumn('oddsRatio', when(col('Type') == 'Binary', col('binOddsRatio')))
        .withColumn('oddsRatioConfidenceIntervalLower',
                    when(col('Type') == 'Binary', col('LCI')))
        .withColumn('oddsRatioConfidenceIntervalUpper',
                    when(col('Type') == 'Binary', col('UCI')))
        .withColumn('ancestry', lit('EUR'))
        .withColumn('ancestryId', lit('HANCESTRO_0005'))
        .withColumnRenamed('nSamples', 'studySampleSize')
        .withColumnRenamed('nCases', 'studyCases')
        .withColumnRenamed('nCasesQV', 'studyCasesWithQualifyingVariants')
        .withColumnRenamed('CollapsingModel', 'statisticalMethod')
        .withColumn('statisticalMethodOverview', col('statisticalMethod'))
        .replace(to_replace=METHOD_DESC, subset=['statisticalMethodOverview'])
        .withColumn('allelicRequirements',
                    when(col('statisticalMethod') == 'rec',
                         array(lit('recessive'))).otherwise(array(lit('dominant'))))
        .select(to_keep)
        .distinct()
    )
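A small standalone check of the mantissa/exponent decomposition used above (a sketch assuming an active SparkSession; the p-values are made-up illustrations):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, log10, pow, round as spark_round
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
pvals = spark.createDataFrame([(3.2e-08,), (5.0e-03,)], ['pValue'])
pvals = (
    pvals
    .withColumn('pValueExponent', log10(col('pValue')).cast(IntegerType()) - lit(1))
    .withColumn('pValueMantissa',
                spark_round(col('pValue') / pow(lit(10), col('pValueExponent')), 3))
)
pvals.show()  # 3.2e-08 -> mantissa 3.2, exponent -8; 5.0e-03 -> mantissa 5.0, exponent -3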
示例#24
0
def tocolumns(df, expr):
    import pyspark.sql.functions as fcns
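    # note: besides the local pyspark import above, this translator relies on `math`
    # and `histbook.expr` being imported at module level (both are referenced below)
    # and on `df` being a pyspark.sql.DataFrame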

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)

    elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]

    elif isinstance(expr, histbook.expr.Call):
        if expr.fcn == "abs" or expr.fcn == "fabs":
            return fcns.abs(tocolumns(df, expr.args[0]))
        elif expr.fcn == "max" or expr.fcn == "fmax":
            return fcns.greatest(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "min" or expr.fcn == "fmin":
            return fcns.least(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "arccos":
            return fcns.acos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arccosh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arcsin":
            return fcns.asin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arcsinh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arctan2":
            return fcns.atan2(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "arctan":
            return fcns.atan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arctanh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "ceil":
            return fcns.ceil(tocolumns(df, expr.args[0]))
        elif expr.fcn == "copysign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "cos":
            return fcns.cos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "cosh":
            return fcns.cosh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "rad2deg":
            return tocolumns(df, expr.args[0]) * (180.0 / math.pi)
        elif expr.fcn == "erfc":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "erf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp":
            return fcns.exp(tocolumns(df, expr.args[0]))
        elif expr.fcn == "expm1":
            return fcns.expm1(tocolumns(df, expr.args[0]))
        elif expr.fcn == "factorial":
            return fcns.factorial(tocolumns(df, expr.args[0]))
        elif expr.fcn == "floor":
            return fcns.floor(tocolumns(df, expr.args[0]))
        elif expr.fcn == "fmod":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "gamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "hypot":
            return fcns.hypot(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "isinf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isnan":
            return fcns.isnan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "lgamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "log10":
            return fcns.log10(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log1p":
            return fcns.log1p(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log":
            return fcns.log(tocolumns(df, expr.args[0]))
        elif expr.fcn == "pow":
            return fcns.pow(tocolumns(df, expr.args[0]),
                            tocolumns(df, expr.args[1]))
        elif expr.fcn == "deg2rad":
            return tocolumns(df, expr.args[0]) * (math.pi / 180.0)
        elif expr.fcn == "sinh":
            return fcns.sinh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sin":
            return fcns.sin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sqrt":
            return fcns.sqrt(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tanh":
            return fcns.tanh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tan":
            return fcns.tan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "trunc":
            raise NotImplementedError(
                expr.fcn)  # FIXME (fcns.trunc is for dates)
        elif expr.fcn == "xor":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "conjugate":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "heaviside":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isfinite":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "left_shift" and isinstance(expr.args[1],
                                                     histbook.expr.Const):
            return fcns.shiftLeft(tocolumns(df, expr.args[0]),
                                  expr.args[1].value)
        elif expr.fcn == "log2":
            return fcns.log2(tocolumns(df, expr.args[0]))
        elif expr.fcn == "logaddexp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "logaddexp":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "mod" or expr.fcn == "fmod":
            return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1])
        elif expr.fcn == "right_shift" and isinstance(expr.args[1],
                                                      histbook.expr.Const):
            return fcns.shiftRight(tocolumns(df, expr.args[0]),
                                   expr.args[1].value)
        elif expr.fcn == "rint":
            return fcns.rint(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "where":
            return fcns.when(tocolumns(df, expr.args[0]),
                             tocolumns(df, expr.args[1])).otherwise(
                                 tocolumns(df, expr.args[2]))
        elif expr.fcn == "numpy.equal":
            return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.not_equal":
            return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less":
            return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less_equal":
            return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.isin":
            return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_not":
            return ~tocolumns(df, expr.args[0])
        elif expr.fcn == "numpy.add":
            return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.subtract":
            return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.multiply":
            return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.true_divide":
            return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_or":
            return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_and":
            return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1])
        else:
            raise NotImplementedError(expr.fcn)

    else:
        raise AssertionError(expr)
示例#25
0
def compile_log10(t, expr, scope, **kwargs):
    op = expr.op()

    # translate the log10 node's argument into a Spark column in the current scope,
    # then apply pyspark.sql.functions.log10 to it
    src_column = t.translate(op.arg, scope)
    return F.log10(src_column)
                                 how="left_outer")

# COMMAND ----------

#check if demographics are needed
if demographics:
    print('generating demographics')
    demographicTbl = generate_demographic_info(orderUserInfoDF, userCols)
    entityInfoDF = entityInfoDF.join(demographicTbl, entity, how='left_outer')

# COMMAND ----------

# finally add bunch of log values for selected cols
cols_for_log = [
    x for x, y in entityInfoDF.dtypes
    if any(map(x.startswith, summaryOps['logs']))
    if y in ['double', 'float', 'int', 'long', 'bigint']
]
print "gen log for cols", cols_for_log
for col in cols_for_log:
    entityInfoDF = entityInfoDF.withColumn(
        "log_" + col,
        F.when(entityInfoDF[col] > 0, F.log10(col)).otherwise(0))

# COMMAND ----------

entityInfoDF = entityInfoDF.cache()
entityInfoDF.repartition(1).write.mode("overwrite").format('com.databricks.spark.csv') \
    .options(header='true', mode="overwrite") \
    .save('/mnt/' + AWS_BUCKET_NAME + '/' + destEntityInfoPath)