from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F

from src.models.db_config import DbConfig
import src.lib.db_utils as DbUtils

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()
    sqlContext = SQLContext(spark)

    db_config = DbConfig(db_name="healthcare2", db_table="public.neighborhood")

    s3_input_file_location = "s3a://data2insights/zipcode/parquet/scored_neighborhood_master"

    cols = [
        F.col("zip_code").alias("zip_code"),
        F.col("state").alias("state"),
        F.col("latitude").alias("latitude"),
        F.col("longitude").alias("longitude"),
        F.col("county_fips").alias("county_fips"),
        F.col("county_name").alias("county_name"),
        F.col("provider_count").alias("provider_count"),
        F.col("institution_count").alias("institution_count"),
        F.col("medicare_count").alias("medicare_count"),
        F.col("score").alias("score")
    ]

    scored_neighborhood_df = sqlContext.read.parquet(
        s3_input_file_location).select(cols)
    DbUtils.insert(db_config=db_config, dataframe=scored_neighborhood_df)

    spark.stop()
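
All four examples rely on DbConfig and src.lib.db_utils.insert, whose implementations are not shown here. For orientation only, the snippet below is a minimal sketch of what they might look like, assuming the target database is PostgreSQL (suggested by the "public." schema prefix) and that insert simply wraps Spark's built-in JDBC writer; the host, port, credentials, and write mode are placeholders, not the project's actual code.

from dataclasses import dataclass

from pyspark.sql import DataFrame


@dataclass
class DbConfig:
    """Minimal stand-in for src.models.db_config.DbConfig (assumed shape)."""
    db_name: str
    db_table: str
    host: str = "localhost"   # placeholder
    port: int = 5432          # placeholder PostgreSQL port
    user: str = "postgres"    # placeholder
    password: str = ""        # placeholder


def insert(db_config: DbConfig, dataframe: DataFrame) -> None:
    """Hypothetical equivalent of src.lib.db_utils.insert: append rows over JDBC."""
    url = f"jdbc:postgresql://{db_config.host}:{db_config.port}/{db_config.db_name}"
    (
        dataframe.write
        .format("jdbc")
        .option("url", url)
        .option("dbtable", db_config.db_table)
        .option("user", db_config.user)
        .option("password", db_config.password)
        .option("driver", "org.postgresql.Driver")
        .mode("append")
        .save()
    )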
Example #2
from pyspark.sql import SparkSession, SQLContext
from src.models.db_config import DbConfig
import src.lib.db_utils as DbUtils
import pyspark.sql.functions as F

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()
    sqlContext = SQLContext(spark)

    db_config = DbConfig(db_name="healthcare2", db_table="public.provider")

    s3_input_file_location = "s3a://data2insights/provider/parquet/scored_provider_master"

    cols = [
        F.col("npi").alias("npi"),
        F.col("last_name").alias("last_name"),
        F.col("first_name").alias("first_name"),
        F.col("middle_name").alias("middle_name"),
        F.col("suffix").alias("suffix"),
        F.col("credentials").alias("credentials"),
        F.col("gender").alias("gender"),
        F.col("specialty").alias("specialty"),
        F.col("medicare_count").alias("medicare_count"),
        F.col("score").alias("score"),
        F.col("zip_code").alias("zip_code")
    ]

    scored_hcp_df = sqlContext.read.parquet(s3_input_file_location).select(
        cols)

    DbUtils.insert(db_config=db_config, dataframe=scored_hcp_df)

    spark.stop()
Example #3
from pyspark.sql import SparkSession, SQLContext
from src.models.db_config import DbConfig
import src.lib.db_utils as DbUtils

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()
    sqlContext = SQLContext(spark)

    db_config = DbConfig(db_name="healthcare", db_table="public.npi_hco")

    s3_input_file_location = (
        "s3a://data2insights/NPPES/parquet/npi_hco"
    )

    limit = 10000000
    cols = [
        "npi",
        "entity_type_code",
        "provider_organization_name_legal_business_name",
        "provider_first_line_business_mailing_address",
        "provider_second_line_business_mailing_address",
        "provider_business_mailing_address_city_name",
        "provider_business_mailing_address_state_name",
        "zip5",
    ]
    df = sqlContext.read.parquet(s3_input_file_location).select(cols).limit(limit)
    DbUtils.insert(db_config=db_config, dataframe=df)

    spark.stop()
Example #4
from pyspark.sql import SparkSession, SQLContext
from src.models.db_config import DbConfig
import src.lib.db_utils as DbUtils
import pyspark.sql.functions as F

if __name__ == "__main__":

    spark = SparkSession.builder.appName("data2insights").getOrCreate()
    sqlContext = SQLContext(spark)

    db_config = DbConfig(db_name="healthcare2", db_table="public.institution")

    s3_input_file_location = "s3a://data2insights/institution/parquet/scored_institution_master"

    cols = [
        F.col("npi").alias("npi"),
        F.col("name").alias("name"),
        F.col("medicare_count").alias("medicare_count"),
        F.col("score").alias("score"),
        F.col("zip_code").alias("zip_code")
    ]

    scored_institution_df = sqlContext.read.parquet(
        s3_input_file_location).select(cols)

    DbUtils.insert(db_config=db_config, dataframe=scored_institution_df)

    spark.stop()
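
If the writes do go through JDBC as sketched after Example #1, the PostgreSQL driver has to be available on the Spark classpath. One way to arrange that (an assumption, not something these examples show) is to request the driver package when building the session; the artifact version below is only an example.

spark = (
    SparkSession.builder
    .appName("data2insights")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.18")  # assumed driver coordinates
    .getOrCreate()
)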