def generateCanonicalName(self, dataFrame, filename):
    logger.info('in generateCanonicalName')
    mapDict = self.createDictfromMapfile(filename)

    def gen(text):
        # Look up the canonical manufacturer name; fall back to 'UNKNOWN'.
        Manufacturer = mapDict.get(text, -1)
        if Manufacturer != -1:
            return Manufacturer
        else:
            return 'UNKNOWN'

    udf = Functions.UserDefinedFunction(lambda x: gen(x))
    # Apply the UDF once per distinct manufacturer, then join the result back.
    localDf = dataFrame.groupBy(
        Functions.col("Cleaned Manufacturer").alias(
            'Cleaned Manufacturer2')).agg(Functions.lit('1'))
    localDf = localDf.withColumn(
        "Canonical_Manufacturer",
        udf(Functions.col("Cleaned Manufacturer2"))).drop('1')
    dataFrame = dataFrame.join(
        localDf,
        (dataFrame['Cleaned Manufacturer'] ==
         localDf['Cleaned Manufacturer2'])).drop(
            localDf['Cleaned Manufacturer2'])
    return dataFrame
def calc_TX_PVLS(patient_agg_obs: DataFrame,
                 VL_code: str,
                 end_date_str: str = None) -> pandas.DataFrame:
    """Calculates TX_PVLS indicator with its corresponding disaggregations.

    Args:
        patient_agg_obs: A DataFrame generated by `join_patients_agg_obs()`.
        VL_code: The code for viral load values.
        end_date_str: The string representation of the last date as
            'YYYY-MM-DD'.

    Returns:
        A Pandas DataFrame with one row per (sup_VL, agg_bucket) pair, the
        patient count for that pair, and that count's ratio over all patients.
    """
    end_date = datetime.today()
    if end_date_str:
        end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
    agg_buckets_udf = F.UserDefinedFunction(
        lambda a, g: agg_buckets(a, g, end_date),
        T.ArrayType(T.StringType()))
    VL_df = patient_agg_obs.withColumn(
        'sup_VL', patient_agg_obs[VL_code + '_max_value'] < 150).withColumn(
            'agg_buckets',
            agg_buckets_udf(patient_agg_obs['birthDate'],
                            patient_agg_obs['gender']))
    num_patients = VL_df.count()
    VL_agg_P = VL_df.select(
        VL_df.sup_VL,
        F.explode(VL_df.agg_buckets).alias('agg_bucket')).groupBy(
            'sup_VL', 'agg_bucket').agg(
                F.count('*').alias('count')).toPandas().sort_values(
                    ['agg_bucket', 'sup_VL'])
    VL_agg_P['ratio'] = VL_agg_P['count'] / num_patients
    return VL_agg_P
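# Usage sketch for calc_TX_PVLS. The DataFrame names and the viral-load code
# below are illustrative assumptions, not part of this codebase:
#
#   agg_df = join_patients_agg_obs(patient_df, obs_df)
#   tx_pvls = calc_TX_PVLS(agg_df, VL_code='856', end_date_str='2020-12-31')
#   print(tx_pvls[['agg_bucket', 'sup_VL', 'count', 'ratio']])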
def process_income_csv(income_csv_arg):
    income_df = spark.read.csv(income_csv_arg)
    income_df = income_df.filter((income_df['_c2'] != 'GEO.display-label') &
                                 (income_df['_c2'] != 'Geography')) \
        .select(income_df['_c2'].alias('county_state'),
                income_df['_c7'].alias('income'))
    county_state_column = pyspark.sql.functions.split(
        income_df['county_state'], ',')
    county_name = pyspark.sql.functions.split(county_state_column.getItem(0),
                                              ' ').getItem(0)
    state_name = functions.trim(county_state_column.getItem(1))
    income_df = income_df.withColumn('county',
                                     pyspark.sql.functions.lower(county_name)) \
        .withColumn('state', functions.lower(state_name)) \
        .withColumn('income', income_df['income'].cast('int')) \
        .drop('county_state')
    state_abbr_udf = functions.UserDefinedFunction(
        lambda state: states_abbr_mapping.get(state), types.StringType())
    income_df = income_df.withColumn(
        "state_abbr", state_abbr_udf(income_df["state"])).drop('state')
    income_df = income_df.withColumn(
        'combine',
        functions.concat(income_df['county'], income_df['state_abbr'])) \
        .filter((income_df['state_abbr'].isNotNull()) &
                (income_df['state_abbr'].isin(required_states_abbr)))
    return income_df
def generateRxInfo(self, dataFrame):
    logger.info('in generateRxInfo')
    base_uri = 'http://{}:{}/REST/rxcui/'.format(
        Constants.POSTGRESQL_HOST_IP, Constants.RXNAV_PORT)

    def genRxInfo(RxNormId):
        # Collect brand names (BN), ingredients (IN), and human/animal tags
        # from the RxNav "allrelatedextension" endpoint.
        d = {'BN': [], 'IN': [], 'tag': set()}
        if RxNormId != '':
            try:
                resp = requests.get(url=base_uri + RxNormId +
                                    "/allrelatedextension")
                root = ET.fromstring(resp.content)
                for conceptGroup in root.findall(
                        './allRelatedGroup/conceptGroup'):
                    for tty in conceptGroup.findall('./tty'):
                        if tty.text == 'BN':
                            for brandname in conceptGroup.findall(
                                    './conceptProperties/name'):
                                d['BN'].append(brandname.text)
                        if tty.text == 'IN':
                            for Ingredientname in conceptGroup.findall(
                                    './conceptProperties/name'):
                                d['IN'].append(Ingredientname.text)
                    for humandrug in conceptGroup.findall(
                            './conceptProperties/inferedhuman'):
                        if humandrug.text == 'US':
                            d['tag'].add('Human_Drug')
                    for animaldrug in conceptGroup.findall(
                            './conceptProperties/inferedvet'):
                        if animaldrug.text == 'US':
                            d['tag'].add('Animal_Drug')
            except ET.ParseError as e:
                print("Invalid XML received: {}".format(e))
        return str(d['BN']) + '|' + str(d['IN']) + '|' + ','.join(d['tag'])

    udf = Functions.UserDefinedFunction(genRxInfo, StringType())
    localDf = dataFrame.groupBy(
        Functions.col("RxNormId").alias("RxNormId2")).agg(Functions.lit('1'))
    localDf = localDf.withColumn("RxInfo",
                                 udf(Functions.col("RxNormId2"))).drop('1')
    localDf = dataFrame.join(
        localDf, (dataFrame['RxNormId'] == localDf['RxNormId2'])).drop(
            localDf['RxNormId2'])
    split_col = Functions.split(localDf['RxInfo'], '\\|')
    localDf = localDf.withColumn("Brand_Names", split_col.getItem(0)) \
        .withColumn("Ingredients", split_col.getItem(1)) \
        .withColumn(
            "tag",
            Functions.when(localDf.tag == '', split_col.getItem(2))
            .otherwise(
                Functions.when(
                    Functions.length(split_col.getItem(2)) == 0, localDf.tag)
                .otherwise(
                    Functions.concat(localDf.tag, Functions.lit(','),
                                     split_col.getItem(2))))).drop('RxInfo')
    # self.container1.stop()
    # self.container2.stop()
    return localDf
def main():
    df = spark.read.json(katkam_in_directory)

    # Build the weather schema from the 'schema' file, one column name per line.
    with open('schema') as schema_file:
        schema_lines = [i.strip() for i in schema_file.readlines()]
    schema = types.StructType(
        [types.StructField(i, types.StringType(), False)
         for i in schema_lines])

    weather = spark.read.csv(weather_in_directory, schema=schema)  # .withColumn('filename', functions.input_file_name())
    df = df.join(weather, 'Date/Time')

    # https://stackoverflow.com/questions/39025707/how-to-convert-arraytype-to-densevector-in-pyspark-dataframe
    to_vec = functions.UserDefinedFunction(lambda vs: Vectors.dense(vs),
                                           VectorUDT())
    get_rid_of_rain = functions.UserDefinedFunction(lambda vs: rain_gone(vs),
                                                    types.LongType())
    df = df.select(get_rid_of_rain(df['Weather']).alias('label'),
                   to_vec(df['image']).alias('features'))

    # Do machine learning
    splits = df.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # Naive Bayes Model
    # nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # Logistic Regression Model
    lr = LogisticRegression()
    model = lr.fit(train)
    predictions = model.transform(test)

    # Compute accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    # Write the final predictions dataframe to a JSON directory
    predictions.write.json(out_directory, mode='overwrite')

    # Write the final accuracy score to a text file; tide analysis will write
    # to the same file
    with open(out_directory + '/final-results.txt', 'w+') as fp:
        fp.write('Test set accuracy for weather analysis: ' + str(accuracy))
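# `rain_gone` is referenced above but not defined in this snippet. A minimal
# sketch consistent with the LongType UDF, assuming the label flags rain-free
# weather strings (the exact label semantics are an assumption):
def rain_gone(weather):
    # 1 = no rain mentioned in the free-text Weather column, 0 = rain.
    return 0 if weather and 'Rain' in weather else 1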
def convert_bigint_to_int(df):
    # Downcast every bigint column to int.
    for col, t in df.dtypes:
        if t == 'bigint':
            df = df.withColumn(col, df[col].cast(IntegerType()))
    # Replace nullable profile fields with 1/0 presence flags.
    f_udf = F.UserDefinedFunction(lambda x: 1 if x is not None else 0,
                                  IntegerType())
    df = df.withColumn("blog", f_udf(df.blog))
    df = df.withColumn("company", f_udf(df.company))
    df = df.withColumn("hireable", f_udf(df.hireable))
    return df
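# A quick check of convert_bigint_to_int (assumes an active SparkSession named
# `spark`; the sample rows are illustrative):
sample = spark.createDataFrame(
    [(1, 'https://example.blog', 'acme', True),
     (2, None, None, None)],
    ['id', 'blog', 'company', 'hireable'])
# `id` is inferred as bigint and cast down; blog/company/hireable become 1/0.
convert_bigint_to_int(sample).show()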
def main(business_id_arg):
    concat_list = udf(lambda lst: ", ".join(lst), types.StringType())
    reviews_df = spark.read.format("org.apache.spark.sql.cassandra") \
        .options(table=TABLE_REVIEW, keyspace=KEY_SPACE) \
        .load()
    review_filter = reviews_df.filter(
        reviews_df.business_id == business_id_arg)
    review_concatenate = review_filter.groupby('business_id').agg(
        collect_list('review').alias("review"))
    review_concatenate.show()
    train_fin = review_concatenate.withColumn("review", concat_list("review"))
    train_fin = train_fin.withColumn(
        "review",
        functions.regexp_replace(train_fin.review, "[^0-9A-Za-z ,]", ""))

    # Create a new pipeline with a Tokenizer and a Lemmatizer
    documentAssembler = DocumentAssembler().setInputCol("review") \
        .setOutputCol("document")
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    lemmatizer = Lemmatizer().setInputCols(["token"]).setOutputCol("lemma") \
        .setDictionary("lemmas001.txt", key_delimiter=" ",
                       value_delimiter="\t")
    pipeline = Pipeline(stages=[documentAssembler, tokenizer, lemmatizer])
    pipelineFit = pipeline.fit(train_fin)
    train_df = pipelineFit.transform(train_fin)
    train_df.select('lemma').show(truncate=False)

    price_range_udf = functions.UserDefinedFunction(
        lambda attributes: get_attributes(attributes), types.StringType())
    train_df = train_df.withColumn('lemma',
                                   price_range_udf(train_df['lemma']))
    train_df = train_df.withColumn(
        'lemma',
        functions.split(train_df['lemma'], ",").cast('array<string>'))

    # Create a new pipeline to remove the stop words
    test_review = train_df.select("lemma")
    stop_words_remover = StopWordsRemover(inputCol="lemma",
                                          outputCol="filtered")
    hash_tf = HashingTF(numFeatures=2 ** 16, inputCol="lemma", outputCol='tf')
    pipeline_to_remove_stop_words = Pipeline(
        stages=[hash_tf, stop_words_remover])
    pipeline_fit = pipeline_to_remove_stop_words.fit(train_df)
    test_df = pipeline_fit.transform(test_review)
    test_df.show()

    token_array = test_df.select('filtered').rdd.flatMap(
        lambda row: row).collect()
    counts = Counter(token_array[0])
    word_cloud = WordCloud(
        background_color='white',
        max_words=100,
        max_font_size=50,
        min_font_size=10,
        random_state=40
    ).fit_words(counts)
    plt.imshow(word_cloud)
    plt.axis('off')  # remove axis
    plt.show()
def validate_tffm(spark, sc, model, test_df, s3_metrics_path, s3_endpoint_path):
    # get predictions
    validation_df = model.transform(test_df)
    metricsSchema = StructType() \
        .add("metric", StringType()) \
        .add("value", DoubleType())
    metrics_names = []

    # apply threshold
    def thresholdScore(x):
        retval = 0.0
        if x > 0.5:
            retval = 1.0
        return retval

    thresholdScoreUdf = F.UserDefinedFunction(thresholdScore, T.FloatType())
    validation_df_round = validation_df.withColumn(
        'rscore', thresholdScoreUdf(validation_df.score))
    predTffm = validation_df_round.select(['label', 'rscore'])
    predictionAndLabelsTffm = predTffm.rdd.map(
        lambda lp: (lp.rscore, lp.label))

    metricsTffm = BinaryClassificationMetrics(predictionAndLabelsTffm)
    metrics_names.append(("Area_under_PR", metricsTffm.areaUnderPR))
    metrics_names.append(("Area_under_ROC", metricsTffm.areaUnderROC))

    mmetricsTffm = MulticlassMetrics(predictionAndLabelsTffm)
    metrics_names.append(("Precision", mmetricsTffm.precision()))
    metrics_names.append(("Recall", mmetricsTffm.recall()))
    metrics_names.append(("F1", mmetricsTffm.fMeasure()))
    metrics_names.append(("Weighted_recall", mmetricsTffm.weightedRecall))
    metrics_names.append(("Weighted_precision",
                          mmetricsTffm.weightedPrecision))
    metrics_names.append(("Weighted_F1", mmetricsTffm.weightedFMeasure()))
    metrics_names.append(("Weighted_F05",
                          mmetricsTffm.weightedFMeasure(beta=0.5)))
    metrics_names.append(("Weighted_FP_rate",
                          mmetricsTffm.weightedFalsePositiveRate))

    mRdd = sc.parallelize(metrics_names).coalesce(1)
    dfMetrics = spark.createDataFrame(mRdd, metricsSchema)
    dfMetrics.write.csv("{0}/{1}".format(s3_metrics_path, model.endpointName),
                        mode="overwrite")

    endpointSchema = StructType() \
        .add("time", StringType()) \
        .add("endpoint", StringType())
    endpoint_name = []
    endpoint_name.append((str(time.time()), str(model.endpointName)))
    eRdd = sc.parallelize(endpoint_name).coalesce(1)
    dfEndpoint = spark.createDataFrame(eRdd, endpointSchema)
    dfEndpoint.write.csv("{0}/endpoint.txt".format(s3_endpoint_path),
                         mode="overwrite")
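# The same thresholding can be expressed without a Python UDF, which keeps the
# computation inside the JVM; a sketch equivalent to thresholdScore above:
#
#   validation_df_round = validation_df.withColumn(
#       'rscore', F.when(validation_df.score > 0.5, 1.0).otherwise(0.0))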
def _flatten_obs(obs: pyspark_sql.DataFrame,
                 code_system: str = None) -> pyspark_sql.DataFrame:
    """Creates a flat version of Observation FHIR resources.

    Note `code_system` is only applied on `code.coding` which is a required
    field, i.e., it is not applied on `value.codeableConcept.coding`.

    Args:
        obs: A collection of Observation FHIR resources.
        code_system: The code system to be used for filtering `code.coding`.

    Returns:
        A DataFrame with the following columns (note one input observation
        might be repeated, once for each of its codes):
        - `coding` from the input observation's `code.coding`
        - `valueCoding` from the input's `value.codeableConcept.coding`
        - `value` from the input's `value`
        - `patientId` from the input's `subject.patientId`
        - `dateTime` from the input's `effective.dateTime`
    """
    sys_str = ('coding.system="{}"'.format(code_system)
               if code_system else "coding.system IS NULL")
    value_sys_str_base = ('valueCoding.system="{}"'.format(code_system)
                          if code_system else "valueCoding.system IS NULL")
    value_sys_str = "(valueCoding IS NULL OR {})".format(value_sys_str_base)
    merge_udf = F.UserDefinedFunction(_merge_date_and_value, T.StringType())
    return (obs.withColumn(
        "coding", F.explode("code.coding")).where(sys_str).withColumn(
            "valueCoding",
            # Note valueCoding can be null.
            F.explode_outer("value.codeableConcept.coding"),
        ).where(value_sys_str).withColumn(
            "dateAndValue",
            merge_udf(F.col("effective.dateTime"),
                      F.col("value.quantity.value")),
        ).withColumn(
            "dateAndValueCode",
            merge_udf(F.col("effective.dateTime"), F.col("valueCoding.code")),
        ).select(
            F.col("coding"),
            F.col("valueCoding"),
            F.col("value"),
            F.col("subject.patientId").alias("patientId"),
            F.col("effective.dateTime").alias("dateTime"),
            F.col("dateAndValue"),
            F.col("dateAndValueCode"),
            F.col("context.EncounterId").alias("encounterId"),
        ))
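# `_merge_date_and_value` is defined elsewhere in the module. A plausible
# sketch consistent with its use above (assumption: it pairs the effective
# date with a non-null value and returns None otherwise):
def _merge_date_and_value(d, v):
    if v is None:
        return None
    return '{},{}'.format(d, v)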
def main():
    # do things...
    messages = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers',
                '199.60.17.210:9092,199.60.17.193:9092') \
        .option('subscribe', topic).load()
    line = messages.selectExpr('CAST (value as string)')
    my_udf_x = F.UserDefinedFunction(get_x, returnType=types.FloatType())
    my_udf_y = F.UserDefinedFunction(get_y, returnType=types.FloatType())
    line = line.withColumn('X', my_udf_x('value'))
    line = line.withColumn('Y', my_udf_y('value'))
    line = line.drop('value')
    line.createOrReplaceTempView('line')
    # Simple linear regression: beta from the normal equations, then
    # alpha = mean(Y) - beta * mean(X).
    df = spark.sql('''select (sum(X*Y)-((1/count(*))*sum(X)*sum(Y)))
        /(sum(power(X,2))-((1/count(*))*power(sum(X),2))) as beta,
        (sum(Y)/count(*)) as lhs,((sum(X))/count(*)) as rhs from line''')
    df.createOrReplaceTempView('final')
    df_final = spark.sql(
        '''select beta,(lhs-(beta*rhs)) as alpha from final''')
    stream = df_final \
        .writeStream \
        .outputMode("complete") \
        .format("console") \
        .start()
    stream.awaitTermination(600)
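# `get_x`/`get_y` are defined elsewhere. Plausible sketches, assuming each
# Kafka message value is a text line of space-separated "x y" floats:
def get_x(value):
    return float(value.split(' ')[0])


def get_y(value):
    return float(value.split(' ')[1])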
def generateDrugNames(self, dataFrame, filename):
    logger.info('in generateDrugNames')
    keywordsDict = self.createDictFromKeywords(filename)
    # Split the description at the first whitespace: token 0 is the drug name,
    # the remainder is the strength.
    split_col = Functions.split(dataFrame['Cleaned Item Description'],
                                '(?<=^[^\\s]*)\\s')
    dataFrame = dataFrame.withColumn(
        "Drug_Name",
        Functions.regexp_replace(split_col.getItem(0), '[\\,,\\.]', ''))
    dataFrame = dataFrame.withColumn("Drug Strength", split_col.getItem(1))

    def genDosageDict(text):
        tokens = text.split()
        dosage_fullname = ''
        dosage = ''
        tag = ''
        keywords = []
        for token in tokens:
            token_fullname = keywordsDict.get(token, -1)
            if token_fullname != -1:
                keywords.append(token_fullname)
                # The first keyword hit determines the dosage fields.
                if len(keywords) == 1:
                    dosage_fullname = token_fullname
                    dosage = token
                    tag = 'Drug'
        keywords = set(keywords)
        if len(keywords) > 0:
            return '{}!{}!{}!{}'.format(dosage, dosage_fullname, tag,
                                        keywords)
        else:
            return '{}!{}!{}!{}'.format(dosage, dosage_fullname, tag, '')

    udf = Functions.UserDefinedFunction(lambda x: genDosageDict(x))
    localDf = dataFrame.groupBy(
        Functions.col("Cleaned Item Description").alias(
            'Cleaned Item Description2')).agg(Functions.lit('1'))
    localDf = localDf.withColumn(
        "dosage-fields",
        udf(Functions.col("Cleaned Item Description2"))).drop('1')
    split_col = Functions.split(localDf['dosage-fields'], '!')
    localDf = localDf.withColumn("dosage", split_col.getItem(0)) \
        .withColumn("dosage_fullname", split_col.getItem(1)) \
        .withColumn("tag", split_col.getItem(2)) \
        .withColumn("keywords", split_col.getItem(3)).drop('dosage-fields')
    dataFrame = dataFrame.join(
        localDf,
        (dataFrame['Cleaned Item Description'] ==
         localDf['Cleaned Item Description2'])).drop(
            localDf['Cleaned Item Description2'])
    dataFrame = dataFrame.select(
        "Item_Description", "Item Number ", "Manufacturer",
        "Canonical_Manufacturer", "Drug_Name", "tag", "Key",
        Functions.col("IsDistinctKey").alias('Excluded'))
    return dataFrame
def generateSpendInfoBasedOnManufacturer(self, sparkContext, sparkSession,
                                         DataFrame,
                                         shouldConsiderOnlyTopScore,
                                         matchingThreshold):
    def getPriceINfo(Canonical_Manufacturer, Item_Description,
                     shouldConsiderOnlyTopScore, matchingThreshold):
        db = Database()
        connection = db.getDbConn()
        cursor = connection.cursor()
        postgreSQL_select_Query = ("select proddesc as prodDesc from "
                                   "uniqueProducts where "
                                   "lower(mfrcanonicalname)='{}'").format(
                                       Canonical_Manufacturer)
        cursor.execute(postgreSQL_select_Query)
        records = cursor.fetchall()
        proddesc_list = [t[0] for t in records]
        result = self.common_process(proddesc_list, Item_Description,
                                     shouldConsiderOnlyTopScore,
                                     matchingThreshold, cursor, 'prodDesc')
        cursor.close()
        connection.close()
        return result

    localDf = DataFrame.groupBy(
        Functions.col("Canonical_Manufacturer").alias(
            'Canonical_Manufacturer2'),
        Functions.col("Item_Description").alias('Item_Description2')).agg(
            Functions.lit('1'))
    udf = Functions.UserDefinedFunction(getPriceINfo, StringType())
    localDf = localDf.withColumn(
        'priceinfo',
        udf(Functions.col('Canonical_Manufacturer2'),
            Functions.col('Item_Description2'),
            Functions.lit(shouldConsiderOnlyTopScore),
            Functions.lit(matchingThreshold))).drop('1')
    DataFrame = DataFrame.join(
        localDf,
        (localDf.Canonical_Manufacturer2 == DataFrame.Canonical_Manufacturer) &
        (localDf.Item_Description2 == DataFrame.Item_Description)).drop(
            localDf.Item_Description2).drop(localDf.Canonical_Manufacturer2)
    DataFrame = self.common_process2(DataFrame)
    return DataFrame
def _flatten_obs(obs: DataFrame, code_system: str = None) -> DataFrame:
    """Creates a flat version of Observation FHIR resources.

    Note `code_system` is only applied on `code.coding` which is a required
    field, i.e., it is not applied on `value.codeableConcept.coding`.

    Args:
        obs: A collection of Observation FHIR resources.
        code_system: The code system to be used for filtering `code.coding`.

    Returns:
        A DataFrame with the following columns (note one input observation
        might be repeated, once for each of its codes):
        - `coding` from the input observation's `code.coding`
        - `valueCoding` from the input's `value.codeableConcept.coding`
        - `value` from the input's `value`
        - `patientId` from the input's `subject.patientId`
        - `dateTime` from the input's `effective.dateTime`
    """
    sys_str = 'coding.system="{}"'.format(
        code_system) if code_system else 'coding.system IS NULL'
    value_sys_str_base = 'valueCoding.system="{}"'.format(
        code_system) if code_system else 'valueCoding.system IS NULL'
    value_sys_str = '(valueCoding IS NULL OR {})'.format(value_sys_str_base)
    merge_udf = F.UserDefinedFunction(
        lambda d, v: merge_date_and_value(d, v), T.StringType())
    return obs.withColumn(
        'coding', F.explode('code.coding')).where(sys_str).withColumn(
            'valueCoding',
            # Note valueCoding can be null.
            F.explode_outer('value.codeableConcept.coding')
        ).where(value_sys_str).withColumn(
            'dateAndValue',
            merge_udf(F.col('effective.dateTime'),
                      F.col('value.quantity.value'))).withColumn(
                'dateAndValueCode',
                merge_udf(F.col('effective.dateTime'),
                          F.col('valueCoding.code'))
            ).select(F.col('coding'), F.col('valueCoding'), F.col('value'),
                     F.col('subject.patientId').alias('patientId'),
                     F.col('effective.dateTime').alias('dateTime'),
                     F.col('dateAndValue'), F.col('dateAndValueCode'),
                     F.col('context.EncounterId').alias('encounterId'))
def main():
    df = spark.read.json(katkam_in_directory)
    schema = types.StructType([
        types.StructField('Date/Time', types.StringType(), False),
        types.StructField('label', types.LongType(), False)
    ])
    tides = spark.read.json(tide_in_directory, schema=schema)
    tides.show()
    df = df.join(tides, 'Date/Time')
    df.show()

    # https://stackoverflow.com/questions/39025707/how-to-convert-arraytype-to-densevector-in-pyspark-dataframe
    to_vec = functions.UserDefinedFunction(lambda vs: Vectors.dense(vs),
                                           VectorUDT())
    df = df.select(df['label'], to_vec(df['features']).alias('features'))
    df.show()

    # Do machine learning
    splits = df.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # Logistic Regression Model
    lr = LogisticRegression()
    models = [lr]
    model = [i.fit(train) for i in models]
    predictions = [i.transform(test) for i in model]
    for p in predictions:
        p.show()

    # compute accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = [evaluator.evaluate(i) for i in predictions]

    # Write the final predictions dataframe to a JSON directory
    # (`predictions` is a list, one DataFrame per model, so index into it).
    predictions[0].write.json(out_directory, mode='overwrite')

    # Write the final accuracy score to a text file; tide analysis will write
    # to the same file
    with open(out_directory + '/final-results.txt', 'w+') as fp:
        fp.write('Test set accuracy for tides analysis: ' + str(accuracy))
def add_date_info_spark(df, convert=True):
    '''Take a Spark df and add date info.'''
    if convert:
        # Drop the last four characters and append '+00:00' so the strings
        # parse as UTC timestamps.
        f_datestring = F.UserDefinedFunction(lambda x: x[:-4] + '+00:00',
                                             StringType())
        df = df.withColumn("first_event", f_datestring(df.first_event))
        df = df.withColumn("last_event", f_datestring(df.last_event))
    df = df.withColumn("first_event", F.to_timestamp(df.first_event))
    df = df.withColumn("last_event", F.to_timestamp(df.last_event))
    df = df.withColumn("created_at", F.to_timestamp(df.created_at))
    df = df.withColumn("updated_at", F.to_timestamp(df.updated_at))
    df = df.withColumn("recency", F.datediff(df.last_event, df.created_at))
    df = df.withColumn("time_between_first_last_event",
                       F.datediff(df.last_event, df.first_event))
    return df
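# Usage sketch (assumes a DataFrame `events` whose first_event/last_event are
# strings ending in a 4-character timezone suffix, plus created_at/updated_at):
#
#   events = add_date_info_spark(events)
#   events.select('recency', 'time_between_first_last_event').show()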
def generateNewSpendInfoBasedOnDescription(self, sparkContext, sparkSession,
                                           DataFrame,
                                           shouldConsiderOnlyTopScore,
                                           matchingThreshold):
    def getPriceINfo(Item_Description, shouldConsiderOnlyTopScore,
                     matchingThreshold):
        db = Database()
        connection = db.getDbConn()
        cursor = connection.cursor()
        postgreSQL_select_Query = ("select itemdesc from priceinfo9july2019 "
                                   "where lower(itemdesc) like '{}'").format(
                                       Item_Description)
        cursor.execute(postgreSQL_select_Query)
        records = cursor.fetchall()
        proddesc_list = [t[0] for t in records]
        result = self.common_process(proddesc_list, Item_Description,
                                     shouldConsiderOnlyTopScore,
                                     matchingThreshold, cursor, 'itemDesc')
        cursor.close()
        connection.close()
        return result

    udf = Functions.UserDefinedFunction(getPriceINfo, StringType())
    localDf = DataFrame.groupBy(
        Functions.col("Item_Description").alias('Item_Description2')).agg(
            Functions.lit('1'))
    localDf = localDf.withColumn(
        'priceinfo',
        udf(Functions.col('Item_Description2'),
            Functions.lit(shouldConsiderOnlyTopScore),
            Functions.lit(matchingThreshold))).drop('1')
    DataFrame = DataFrame.join(
        localDf,
        localDf.Item_Description2 == DataFrame.Item_Description).drop(
            localDf.Item_Description2)
    DataFrame = self.common_process3(DataFrame)
    return DataFrame
def main():
    # do things...
    input_file = inputs
    schemas = StructType([
        StructField('Language', types.StringType(), False),
        StructField('Page', types.StringType(), False),
        StructField('Requests', types.IntegerType(), False),
        StructField('Size', types.IntegerType(), False)
    ])
    comment = spark.read.csv(input_file, sep=" ", schema=schemas)
    comment = comment.withColumn('Filename', functions.input_file_name())
    my_udf = F.UserDefinedFunction(path_to_hour,
                                   returnType=types.StringType())
    comment = comment.withColumn('Split', my_udf('Filename'))
    comment = comment.drop('Filename').withColumnRenamed('Split', 'Filename')
    comment = comment.filter((comment.Language == 'en') &
                             (comment.Page != 'Main_page') &
                             (~(comment.Page.like('Special:%'))))
    max_df = comment.groupBy('Filename').agg({'Requests': 'max'})
    max_df = max_df.withColumnRenamed('max(Requests)', 'Requests')
    final_df = max_df.join(comment, ['Filename', 'Requests'])
    final_df = final_df.select('Filename', 'Page',
                               'Requests').orderBy('Filename')
    final_df.write.csv(output, sep=',')
def generateSpendInfoBasedOnUnspsc(self, sparkContext, sparkSession,
                                   DataFrame, shouldConsiderOnlyTopScore,
                                   matchingThreshold):
    def getPriceINfo(Unspsc_Code, Item_Description,
                     shouldConsiderOnlyTopScore, matchingThreshold):
        db = Database()
        connection = db.getDbConn()
        cursor = connection.cursor()
        postgreSQL_select_Query = ("select prodcleandesc,id from "
                                   "uniqueProducts_UNSPSC where "
                                   "unspscclasscode={}").format(Unspsc_Code)
        cursor.execute(postgreSQL_select_Query)
        records = cursor.fetchall()
        result = self.common_process(records, Item_Description,
                                     shouldConsiderOnlyTopScore,
                                     matchingThreshold, cursor, 'prodDesc')
        cursor.close()
        connection.close()
        return result

    localDf = DataFrame.groupBy(
        Functions.col("Unspsc_Code").alias('Unspsc_Code2'),
        Functions.col("Item_Description").alias('Item_Description2')).agg(
            Functions.lit('1'))
    udf = Functions.UserDefinedFunction(getPriceINfo, StringType())
    localDf = localDf.withColumn(
        'priceinfo',
        udf(Functions.col('Unspsc_Code2'),
            Functions.col('Item_Description2'),
            Functions.lit(shouldConsiderOnlyTopScore),
            Functions.lit(matchingThreshold))).drop('1')
    DataFrame = DataFrame.join(
        localDf,
        (localDf.Unspsc_Code2 == DataFrame.Unspsc_Code) &
        (localDf.Item_Description2 == DataFrame.Item_Description)).drop(
            localDf.Item_Description2).drop(localDf.Unspsc_Code2)
    DataFrame = self.common_process2(DataFrame)
    return DataFrame
def createIMMClassifierInput(self, dataFrame):
    dataFrame = self.generatePIM(dataFrame)
    # Generate a fresh UUID per row to serve as a placeholder part number.
    udf = Functions.UserDefinedFunction(lambda x: str(uuid.uuid1()))
    localDf = dataFrame.filter(dataFrame.Excluded == '0')
    localDf = localDf.withColumn('Part Number', udf(Functions.lit('')))
    localDf = localDf.select(
        Functions.col('PIM').alias('PIM_KEY'),
        Functions.col('Canonical_Manufacturer').alias('COMPANY_NAME'),
        Functions.col('Part Number').alias('PART_NUMBER'),
        Functions.col('Item_Description').alias('PRODUCT_DESCRIPTION'),
        Functions.lit('').alias('UNSPSC'))
    data_abs_path = os.path.abspath(
        Constants.IMM_CLASSIFIER_INPUT_FILE_LOCATION)
    localDf.toPandas().to_csv('{}/input.csv'.format(data_abs_path),
                              index=False)
    logger.info("IMM classifier input.csv file placed in {}".format(
        data_abs_path))
    return dataFrame
def process_business_json(input_json_business):
    business_schema = types.StructType([
        types.StructField('business_id', types.StringType(), True),
        types.StructField('city', types.StringType(), True),
        types.StructField('state', types.StringType(), True),
        types.StructField('stars', types.DoubleType(), True),
        types.StructField('review_count', types.LongType(), True),
        types.StructField('categories', types.StringType(), True),
        types.StructField('postal_code', types.StringType(), True),
        types.StructField('latitude', types.DoubleType(), True),
        types.StructField('longitude', types.DoubleType(), True),
        types.StructField(
            'attributes',
            types.StructType([
                types.StructField("RestaurantsPriceRange2",
                                  types.StringType(), True)
            ]), False),
        types.StructField('is_open', types.IntegerType(), True),
        types.StructField('name', types.StringType(), True)
    ])

    # Read business JSON file using the schema
    business_df = spark.read.json(input_json_business, schema=business_schema)
    price_range_udf = functions.UserDefinedFunction(
        lambda attributes: get_price_range(attributes), types.IntegerType())

    # Filter all the businesses which are still open in the business_states
    business_df = business_df.filter(
        (business_df['is_open'] == 1) &
        business_df['state'].isin(business_states) &
        business_df['categories'].like(filter_categories)) \
        .withColumn('pricerange', price_range_udf(business_df['attributes'])) \
        .withColumnRenamed('business_id', 'b_id') \
        .withColumnRenamed('stars', 'b_stars') \
        .select('b_id', 'name', 'city', 'state', 'review_count', 'categories',
                'postal_code', 'latitude', 'longitude', 'pricerange',
                'b_stars') \
        .repartition(100)
    write_to_cassandra(business_df, TABLE_BUSINESS)
    return business_df
def generateRxNormId(self, dataFrame):
    logger.info('in generateRxNormId')
    base_uri = 'http://{}:{}/process/rxnorm?q='.format(
        Constants.POSTGRESQL_HOST_IP, Constants.CTAKES_PORT)

    def genRxNormId(drugName):
        resp = requests.get(url=base_uri + drugName)
        data = resp.json()
        if len(data) > 0:
            return data[0]['code']
        else:
            return ''

    udf = Functions.UserDefinedFunction(genRxNormId, StringType())
    localDf = dataFrame.groupBy(
        Functions.col("Drug_Name").alias('Drug_Name2')).agg(
            Functions.lit('1'))
    localDf = localDf.withColumn("RxNormId",
                                 udf(Functions.col("Drug_Name2"))).drop('1')
    localDf = dataFrame.join(
        localDf, (dataFrame['Drug_Name'] == localDf['Drug_Name2'])).drop(
            localDf['Drug_Name2'])
    localDf = localDf.withColumn(
        "tag",
        Functions.when((localDf.tag == "") & (localDf.RxNormId != ""),
                       "Drug").otherwise(localDf.tag))
    return localDf
def generatePriorityDrugsInfo(self, sparkSession, dataFrame, fileName,
                              matchingThreshold, matching_type):
    logger.info('in generatePriorityDrugsInfo')
    priorityDF = self.getdata(sparkSession, fileName)
    if matching_type == 'FUZZY':
        cleandesc_list = priorityDF['Clean Description'].unique().tolist()

        def genpriorityfields(Item_description):
            bestMatch = process.extractOne(Item_description, cleandesc_list,
                                           scorer=fuzz.token_sort_ratio,
                                           score_cutoff=matchingThreshold)
            if bestMatch is not None:
                priority_desc = priorityDF.loc[
                    priorityDF['Clean Description'] == bestMatch[0],
                    'Item Description'].iloc[0]
                return '{}!{}!{}'.format('true', priority_desc, bestMatch[1])
            else:
                return '{}!{}!{}'.format('false', '', '')
    else:
        cleandesc_list = priorityDF['processed clean desc'].unique().tolist()
        lav = MatchString()

        def genpriorityfields(Item_description):
            bestMatch = lav.extractOneWithcutoff(
                Item_description, cleandesc_list,
                score_cutoff=matchingThreshold)
            if bestMatch is not None:
                priority_desc = priorityDF.loc[
                    priorityDF['processed clean desc'] == bestMatch[0],
                    'Item Description'].iloc[0]
                return '{}!{}!{}'.format('true', priority_desc, bestMatch[1])
            else:
                return '{}!{}!{}'.format('false', '', '')

    localDf = dataFrame.groupBy(
        Functions.col("Item_Description").alias('Item_Description2')).agg(
            Functions.lit('1'))
    udf = Functions.UserDefinedFunction(lambda x: genpriorityfields(x))
    localDf = localDf.withColumn(
        "priority-fields",
        udf(Functions.col("Item_Description2"))).drop('1')
    localDf = dataFrame.join(
        localDf,
        (dataFrame['Item_Description'] ==
         localDf['Item_Description2'])).drop(localDf['Item_Description2'])
    split_col = Functions.split(localDf['priority-fields'], '!')
    localDf = localDf.withColumn("Priority Drug Match Flag",
                                 split_col.getItem(0)) \
        .withColumn("Priority Drug Accuracy", split_col.getItem(2)) \
        .withColumn("Priority Drug Desc",
                    split_col.getItem(1)).drop('priority-fields')
    localDf = localDf.withColumn(
        "tag",
        Functions.when(
            (localDf.tag == "") &
            (Functions.col('Priority Drug Match Flag') == 'true'),
            "Drug").otherwise(localDf.tag))
    return localDf
        'infantil' in txt or \
        'niños' in txt or \
        'niñas' in txt or \
        'menino' in txt or \
        'menina' in txt:
        return 'infantil'
    if 'unisex' in txt or 'unissex' in txt:
        return 'unisex'
    if 'masculino' in txt:
        return 'masculino'
    if 'femenino' in txt or 'feminino' in txt:
        return 'feminino'
    return txt


gender_fix_udf = F.UserDefinedFunction(fix_gender, T.StringType())
flat = flat.withColumn('gender_fixed', gender_fix_udf('gender'))
# convert Order_date from int to datetime
# verify with flat.groupBy('gender_fixed').count().sort('count').show(100)
flat = flat.withColumn('average_cost_per_item',
                       flat.product_net_cost / flat.item_sold)
flat = flat.withColumn('absolute_margin',
                       flat.product_net_cost - flat.product_net_revenue)
flat = flat.withColumn('percentage_margin',
                       flat.absolute_margin / flat.product_net_cost)
flat = flat.withColumn('average_price_per_item',
                       flat.product_net_revenue / flat.item_sold)
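# The same normalization could be expressed with built-in functions instead of
# a UDF; a sketch for two of the branches (order matters, as in fix_gender):
#
#   flat = flat.withColumn(
#       'gender_fixed',
#       F.when(F.col('gender').contains('masculino'), 'masculino')
#        .when(F.col('gender').contains('femenino') |
#              F.col('gender').contains('feminino'), 'feminino')
#        .otherwise(F.col('gender')))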
def main(argv):
    inputfile = False
    outputfile = False
    jumbledwordsfile = False
    usage = ('spark-submit resolvejumbledword.py -i <dictionary inputfile> '
             '-o <dictionary outputfile> -j <jumbled words list file>')
    try:
        opts, args = getopt.getopt(argv, "i:o:j:",
                                   ["ifile=", "ofile=", "jumbletestfile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-j", "--jumbletestfile"):
            jumbledwordsfile = arg
    if not inputfile or not outputfile or not jumbledwordsfile:
        print(usage)
        sys.exit(2)
    print(inputfile)
    print(outputfile)
    print(jumbledwordsfile)

    reformatjson(inputfile, outputfile)
    conf = SparkConf().setAppName("Spark jumble resolution")
    sc = SparkContext(conf=conf)
    # sc = pyspark.SparkContext('Reyane')
    sqlContext = SQLContext(sc)
    # 'multiline' (not 'multinode') is Spark's option for multi-line JSON.
    df = sqlContext.read.option('multiline', True).json(outputfile)

    # Displays the content of the DataFrame to stdout
    print('printing the dataframe as it is from the json file loaded')
    df.show()
    word_udf = F.UserDefinedFunction(sort_string_chars, T.StringType())
    df = df.withColumn("sortedChartWORD", word_udf('word'))
    rank_udf = F.UserDefinedFunction(modify_rank_of_zero, T.IntegerType())
    df = df.withColumn("modifiedRank", rank_udf('rank'))
    print('printing the dataframe with a newly added column')
    df.show()
    sqlContext.registerDataFrameAsTable(df, "table1")

    lines = readjumbledwordfile(jumbledwordsfile)
    sqlstatment = ("SELECT * FROM table1 WHERE sortedChartWORD='" +
                   sort_string_chars(lines[0]) + "'")
    firstline = lines[0]
    lines.remove(lines[0])
    for x in lines:
        sqlstatment = (sqlstatment + " OR sortedChartWORD='" +
                       sort_string_chars(x) + "'")
    print(sqlstatment)
    # lines2=lines.append(firstline)
    # length=len(lines)
    lines = lines + [firstline]
    df2 = sqlContext.sql(sqlstatment)
    # df2=sqlContext.sql("SELECT * FROM table1 WHERE sortedChartWORD='adgln' OR sortedChartWORD='ajmor' OR sortedChartWORD='abcelm' OR sortedChartWORD='aelrwy'")
    df2 = df2.toPandas()
    print("Printing the pandas dataframe")
    print(df2)
    print('')
    print("********** Printing the results **********")
    for x in lines:
        df3 = df2[df2['sortedChartWORD'] == sort_string_chars(x)]
        print('')
        print('Printing the best matched word for ' + x + ':')
        if df3['word'].count() == 0:
            print("No match")
        else:
            print(df3.sort_values(by=['modifiedRank'],
                                  ascending=True).iloc[0]['word'])
    print("********** End of the results **********")
df_0 = spark.read.text("Data/NASA_access_log_Jul95.gz").cache()

# Question 1A
# Exploratory analysis showed that there is one record not from Jul/1995,
# so we can filter the data first.
df = df_0.filter(df_0.value.contains("/Jul/1995")).cache()

# Extract date and time
regExpDate = r'\d{2}/Jul/1995:\d{2}:\d{2}:\d{2}'
df = df.withColumn('date_time', F.regexp_extract('value', regExpDate, 0))
# df.show(5)

# Create new columns - transform into Timestamp
udf_new_date = F.UserDefinedFunction(
    lambda x: datetime.strptime(x, '%d/%b/%Y:%H:%M:%S'), TimestampType())
df = df.withColumn('new_date', udf_new_date('date_time'))
df.printSchema()
df = df.withColumn('day', F.dayofmonth('new_date'))
df = df.withColumn('hour', F.hour('new_date'))

# Categorise hours into six 4-hour intervals
udf_categor = F.UserDefinedFunction(lambda x: '1' if x < 4 else (
    "2" if x < 8 else ("3" if x < 12 else ("4" if x < 16 else
                                           ("5" if x < 20 else "6")))))
df = df.withColumn('hour_categor', udf_categor('hour'))
grouped_categor_hour = df.groupby('hour_categor').agg(
    F.count("value").alias("count")).orderBy("hour_categor").cache()
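# Equivalent bucketing without a Python UDF: each category spans four hours,
# so floor(hour / 4) + 1 reproduces the same '1'..'6' labels:
#
#   df = df.withColumn('hour_categor',
#                      (F.floor(F.col('hour') / 4) + 1).cast('string'))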
.appName("Train Naive Bayes Model ").getOrCreate() # Create sql Context so that we can query data files in sql like syntax sqlContext = SQLContext(spark) spark.sparkContext.setLogLevel("ERROR") # Read in training data (from Yelp) and pre-process it df = sqlContext.read.json(sys.argv[1]) print ("Loaded " + str(df.count()) + " records with the following schema " + str(df.schema.names)) # print header names df = df.select("business_id", "text", "stars") #select the fields we need df = df.where(col("text").isNotNull()) #Remove reviews that have no text df.show(2) # Since its not labeled with sentiment, we will assume that a 4 or above is positive my_udf = F.UserDefinedFunction(generateLabel, IntegerType()) data = df.withColumn("label", my_udf("stars")) # generate a new column called label based on function data.show(2) # Remove punctuation and convert to lowercase punc = udf(lambda x: re.sub(r'[^\w\s\d-]', '', x)) newline = udf(lambda y: re.sub('[\n\t]',' ',y)) data.createOrReplaceTempView("data") data = spark.table('data').select("business_id", lower(col("text")).alias("text"), "label", "stars") data = data.withColumn("text", punc("text")) data = data.withColumn("text", newline("text")) data.show(2) # Generate tokens tokenizer = Tokenizer(inputCol="text", outputCol="words") wordsData = tokenizer.transform(data)
    return date


def create_cohort(date):
    rv = -1
    if date is not None:
        year = date.strftime("%y")
        week = "%02d" % int(date.strftime("%W"))
        rv = int("{0}{1}".format(year, week))
    return rv


clean_string_for_int_udf = Functions.UserDefinedFunction(
    clean_string_for_int, IntegerType())
clean_string_for_date_udf = Functions.UserDefinedFunction(
    clean_string_for_date, DateType())
create_cohort_udf = Functions.UserDefinedFunction(create_cohort,
                                                  IntegerType())

schema = StructType([
    StructField("path", StringType()),
    StructField("creator", StringType()),
    StructField("remover", StringType()),
    StructField("created", StringType()),
    StructField("removed", StringType()),
    StructField("lifespan", StringType()),
])

line_decay_input_df = spark.read.csv(
        table, split_size=split_size,
        row_format=pyspark_cassandra.RowFormat.DICT).setName(table)
    return rdd


def df_for(keyspace, table, split_size=None):
    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(
        sc.cassandraTable(keyspace, table,
                          split_size=split_size).setName(table))
    df.createOrReplaceTempView(table)
    return df


toStringList = functions.UserDefinedFunction(
    lambda names: names.split(' '), types.ArrayType(types.StringType()))


def row_to_dict(row):
    return row.asDict()


def save_rdd_to_cassandra(rdd):
    rdd.saveToCassandra(output_keyspace, 'orders_parts',
                        consistency_level=ConsistencyLevel.ONE,
                        batch_size=100)


def agg_orders(order_rdd):
    order_rdd.cache()
if __name__ == "__main__": # Create Spark Instance spark = SparkSession \ .builder \ .appName("SimpleCrimeCSVParser") \ .getOrCreate() # Location of CSV File filePath = "/Users/terrencebunkley/dev/git/data-eng-bootcamp/data/crimes/Crimes_-_One_year_prior_to_present.csv" def get_DD_MM_YYYY(col): return col.split(' ')[0] getDate = F.UserDefinedFunction(get_DD_MM_YYYY, T.StringType()) # Load DataFrame From the CSV df = spark.read.load(filePath, format="csv", sep=",", inferSchema="true", header="true").cache() print("The column types are %s" % df.dtypes) print("The columns we see in this doc are %s" % df.columns) print("The rows count %i" % df.count()) print("Describe function") df.describe().show() print("Summary function") df.summary().show() # Show Arrests print("The number of arrests we see in this report are %i" % df.filter(df["ARREST"] == "Y").count())
BRP_model = BucketedRandomProjectionLSHModel.load(
    "s3a://lsh-model/BRP_model.model")
lsh = BRP_model.transform(df)
lsh = lsh.drop('features')

# Transform the hash value into string format for storage
def to_str(x):
    # Join the first element of each hash vector, separated by ';'.
    y = ''
    for i in x:
        y += str(i[0])
        y += ';'
    y = y[:-1]
    return y

my_udf = F.UserDefinedFunction(to_str, T.StringType())
lsh = lsh.withColumn('hash', my_udf('hashes'))
lsh = lsh.drop('hashes').withColumnRenamed('hash', 'hashes')

# Combine distributed Spark DataFrame to a Pandas DataFrame
def _map_to_pandas(rdds):
    return [pd.DataFrame(list(rdds))]

def topas(df, n_partitions=None):
    if n_partitions is not None:
        df = df.repartition(n_partitions)
    df_pand = df.rdd.mapPartitions(_map_to_pandas).collect()
    df_pand = pd.concat(df_pand)
    df_pand.columns = df.columns
    return df_pand

lsh_pandas = topas(lsh)