def home():
    """Render the home page with the first five test-set predictions.

    Loads the persisted RandomForest model and the parquet test set, scores
    the data, converts numeric predictions back to string labels, and passes
    the first five (predicted, actual) label pairs to the template.
    """
    # Load the persisted model and the held-out test data.
    model = RandomForestClassificationModel.read().load("./app/static/rf-model")
    test_df = spark.read.parquet("./app/static/test/*.parquet")

    predictions = model.transform(test_df)
    converted_df = labelConverter.transform(predictions)
    # printSchema() prints directly and returns None — don't wrap it in print(),
    # which would emit a spurious "None" line.
    converted_df.printSchema()

    # Collect only the first five rows of each displayed column.
    predicted_rows = converted_df.select("predictionLabel").limit(5).collect()
    actual_rows = converted_df.select("primary_type").limit(5).collect()
    predictionLabels = [row["predictionLabel"] for row in predicted_rows]
    labels = [row["primary_type"] for row in actual_rows]

    return render_template('home.html', labels=labels,
                           predictionLabels=predictionLabels)
def main(account_name, account_key):
    """Score the most recent day of aggregated glucose data and save predictions.

    Pulls patient records and glucose-level aggregates from Azure blob storage,
    applies the persisted string-indexing and one-hot-encoding pipelines plus
    the trained RandomForest model, and writes per-patient probabilities back
    to blob storage as a CSV named after the scored date.
    """
    spark_context = SparkContext()
    sql_context = SQLContext(spark_context)

    # Blob containers: two inputs, one output.
    patient_records_container = 'patientrecords'
    glucose_levels_container = 'glucoselevelsaggs'
    preds_container = 'predictions'

    blob_service = BlobService(account_name=account_name, account_key=account_key)
    blob_service.create_container(preds_container)

    # Score the latest day for which aggregates exist.
    day_to_predict = get_most_recent_date(blob_service, glucose_levels_container)
    df = get_df_from_blob(blob_service, glucose_levels_container,
                          patient_records_container, day_to_predict)

    # Load the persisted feature pipelines and the trained classifier.
    project_path = 'wasb://model@{}.blob.core.windows.net/{}'
    si_pipe_model = PipelineModel.read().load(
        path=project_path.format(account_name, 'si_pipe_model'))
    oh_pipe_model = PipelineModel.read().load(
        path=project_path.format(account_name, 'oh_pipe_model'))
    model = RandomForestClassificationModel.read().load(
        path=project_path.format(account_name, 'model'))

    # String-index then one-hot-encode the raw frame.
    df_spark = sql_context.createDataFrame(df)
    df_preds = oh_pipe_model.transform(si_pipe_model.transform(df_spark))

    num_var_names = ['time_in_hospital', 'num_lab_procedures', 'num_procedures',
                     'num_medications', 'number_outpatient', 'number_emergency',
                     'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
                     'number_diagnoses', 'glucose_min', 'glucose_max',
                     'glucose_mean', 'glucose_var']
    cat_var_names = ['race', 'gender', 'age', 'weight', 'admission_type_id',
                     'discharge_disposition_id', 'admission_source_id',
                     'payer_code', 'medical_specialty', 'max_glu_serum',
                     'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
                     'chlorpropamide', 'glimepiride', 'acetohexamide',
                     'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
                     'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
                     'tolazamide', 'insulin', 'glyburide-metformin',
                     'glipizide-metformin', 'glimepiride-pioglitazone',
                     'metformin-rosiglitazone', 'metformin-pioglitazone',
                     'change', 'diabetesMed', 'diag_1_missing', 'diag_2_missing',
                     'diag_3_missing', 'race_missing', 'weight_missing',
                     'payer_code_missing', 'medical_specialty_missing']

    # Assemble numeric columns plus the encoded categorical columns into the
    # single 'features' vector the model expects.
    assembler = VectorAssembler(
        inputCols=(num_var_names + [c + "__encoded__" for c in cat_var_names]),
        outputCol='features')
    df_preds = model.transform(assembler.transform(df_preds).select('features'))

    # Keep patient id and discharge date alongside P(positive class).
    df_preds_pandas = df_preds.toPandas()
    df_preds_pandas = pd.concat(
        [df[['patient_nbr', 'discharge_date']],
         df_preds_pandas['probability'].map(lambda x: x[1])],
        axis=1)

    # Save the predictions as <day>.csv (slashes replaced with dashes).
    blob_name = '-'.join(str(day_to_predict).split('/')) + '.csv'
    blob_service.put_block_blob_from_text(
        blob_name=blob_name,
        container_name=preds_container,
        text=df_preds_pandas.to_csv(index=False))
    return