import sys

def get_nested_keys(a):
    # return the keys whose values appear to contain nested/serialized structures
    key_list = []
    for i in a.keys():
        b = a[i]
        if "{" in str(b):
            key_list = key_list + [i]
    return key_list
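# A minimal usage sketch (the sample record below is hypothetical, not taken from
# the source tables): get_nested_keys flags keys whose value embeds a serialized
# object.
sample_record = {'caseid': '123', 'entitydata': '{"statuscode": "Active"}'}
print(get_nested_keys(sample_record))  # ['entitydata']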


ignore_list_1 = [
    'entitylogicalname', 'SinkCreatedOn', 'SinkModifiedOn', 'messageid',
    'sourcesystem', 'Id', 'entitydata'
]
ignore_list = [col.lower() for col in ignore_list_1]

sqlContext.sql("use dsc60263_fsm_tz_db")

for m in sqlContext.tables("dsc60263_fsm_tz_db").select('tableName').where(
        "tableName not like '%dup%'").where(
            "tableName not like '%bkp%'").where(
                "tableName not like '%temptz%'").collect():
    k = m.asDict().values()[0].encode('utf-8')
    v1 = sqlContext.table(k).filter(
        "to_date(lastupdatedatetime)  = date_sub(CAST(current_timestamp() as DATE), 1)"
    )
    y1 = v1.select("message__id").distinct()
    if y1.count() > 0:
        v2 = list(v1.select("message__id").toPandas()["message__id"])
        for msg_id in v2:
            print(str(msg_id).strip() + "\n")

sys.exit(0)
Example #2
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as func


hive_context = HiveContext(sc)
# load data
procedure = hive_context.table("default.procedure")
hospitals = hive_context.table("default.hospital")
measures = hive_context.table("default.measure")
surveys = hive_context.table("default.survey")
# cast types
procedure_typecast = procedure.withColumn("score", procedure["score"].cast(DoubleType())) \
    .withColumn("sample", procedure["sample"].cast(IntegerType())) \
    .withColumn("denominator", procedure["denominator"].cast(IntegerType()))
surveys_typecast = surveys.withColumn("hcahps_base", surveys["hcahps_base"].cast(IntegerType())) \
    .withColumn("hcahps_consistency", surveys["hcahps_consistency"].cast(IntegerType()))
surveys_selected = surveys_typecast.select('provider_id', 'hcahps_base', 'hcahps_consistency')
procedures_selected = procedure_typecast.select('provider_id', 'sample', 'score', 'measure_id')
# calculate hospital average scores
hospital_avg = procedure_typecast.where(
    (procedure_typecast['score'] < 100) & (procedure_typecast['sample'] > 50)
).groupby('provider_id').agg(func.avg('score'))
hosp_avg_surv = hospital_avg.join(
    surveys_selected, surveys_selected.provider_id == hospital_avg.provider_id)
# calculate correlation of hospital average score with HCAHPS base score
hosp_avg_surv.stat.corr('avg(score)', 'hcahps_base')
# -0.09947309348326736
hosp_avg_surv.stat.corr('avg(score)', 'hcahps_consistency')
# 0.03405962378975164 
measure_avg = procedure_typecast.where(
    (procedure_typecast['score'] < 100) & (procedure_typecast['sample'] > 50)
).groupby('measure_id').agg(func.avg('score'))
# join procedure-level records with the survey scores so HCAHPS columns are available
procedures_surveys = procedures_selected.join(
    surveys_selected, surveys_selected.provider_id == procedures_selected.provider_id)
measure_avg_surv = measure_avg.join(
    procedures_surveys, procedures_surveys.measure_id == measure_avg.measure_id)
measure_avg_surv.stat.corr('avg(score)', 'hcahps_base')
# -0.0005003159809636114
Example #3
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as func
from pyspark.sql.functions import col

hive_context = HiveContext(sc)
procedure = hive_context.table("default.procedure")
hospitals = hive_context.table("default.hospital")
procedure_typecast = procedure.withColumn(
    "score", procedure["score"].cast(DoubleType())).withColumn(
        "sample", procedure["sample"].cast(IntegerType())).withColumn(
            "denominator", procedure["denominator"].cast(IntegerType()))
procedure_hospital = procedure_typecast.join(
    hospitals, procedure_typecast.provider_id == hospitals.provider_id)
# subset for those procedures that have a score not higher than 100 and a sample of at least 50
score_avg = procedure_hospital.where((procedure_hospital['score'] <= 100) & (
    procedure_hospital['sample'] > 50)).groupby('state').agg(func.avg('score'))
# show the 10 best states
best_states = score_avg.sort(score_avg['avg(score)'].desc())
best_states.show(10)

ignore_list = [
    'entitylogicalname', 'SinkCreatedOn', 'SinkModifiedOn', 'messageid',
    'sourcesystem', 'Id'
]

import re

g1 = 0
g2 = 0
sqlContext.sql("use dsc60230_gcct_tz_db")

for m in sqlContext.tables("dsc60230_gcct_tz_db").select('tableName').where(
        "tableName not like '%bad%'").where(
            "tableName not like '%dup%'").where(
                "tableName not like '%bkp%'").where(
                    "tableName not like '%temptz%'").collect():
    k = m.asDict().values()[0].encode('utf-8')
    v1 = sqlContext.table(k).where(
        "to_date(lastupdatedatetime)  = date_sub(current_date, 1)")
    g1_1 = v1.count()
    g1 = g1 + g1_1
    v3 = {}
    v2 = v1.rdd.map(lambda x: str(x).replace("Row(", "").replace(
        "u\'", "").replace("\'", "").replace("\\\\", "").replace("\"", "")
                    ).map(lambda z: re.findall(r',([a-zA-Z]+):{', z)).filter(
                        lambda x: len(x) > 0).collect()
    v3 = set([item for sublist in v2 for item in sublist])
    if len(v3) > 0:
        print "\nThe table " + str(k) + " seems to have nested data " + str(
            v3) + "\n\n"
    y1 = v1.groupby("shakey").count().filter("count >1")
    if y1.count() > 0:
        y2 = y1.select("shakey").toPandas().to_string().replace("\n", ";")
        print("\\nTable: " + str(k) + " has " + str(y1.count()) +
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import monotonicallyIncreasingId
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType

# instantiate SparkContext for spark-submit
conf = SparkConf().setAppName("transforming effective care and readmissions")
sc = SparkContext(conf=conf)
# read Hive tables
hive_context = HiveContext(sc)
measure_effective_care = hive_context.table("default.measure_effective_care")
# filter
measure_effective_care.registerTempTable("effective_temp")
measure_effective_selected = hive_context.sql(
    " SELECT provider_id, measure_id,score,sample,measure_start,measure_end FROM effective_temp"
)
# add a column to flag the data as effective care
measure_effective_with_care_type = measure_effective_selected.withColumn(
    'care_type',
    lit("effective").cast(StringType()))
# add an empty column for readmission denominators
measure_effective_with_care_type_denominator = measure_effective_with_care_type.withColumn(
    'denominator',
    lit(None).cast(StringType()))

# read in readmission data
measure_readmission = hive_context.table("default.measure_readmission")
measure_readmission.registerTempTable("readmission_temp")
measure_readmission_selected = hive_context.sql(
    " SELECT provider_id, measure_id,denominator,score,measure_start,measure_end FROM readmission_temp"
    'msdyn_surveyresponse', 'optionsetmetadata', 'phonecall', 'queue',
    'queueitem', 'sla', 'slaitem', 'slakpiinstance', 'socialactivity',
    'socialprofile', 'statemetadata', 'statusmetadata', 'systemuser', 'task',
    'team', 'teammembership', 'territory', 'gcct_marketingprogram'
]

nested = ''
duplicates = ''
data_discrepancy = ''
new_fields = ''
new_entities = ''
#for m in sqlContext.tables("dsc10742_gcctmsd_lz_db").select('tableName').where("tableName not like '%bad%'").where("tableName not like '%dup%'").where("tableName not like '%bkp%'").where("tableName not like '%temptz%'").collect():
#k=m.asDict().values()[0].encode('utf-8')
for k in tables_gcct_except_bad_rec:
    v1 = sqlContext.table(k).filter(
        "to_date(lastupdatedatetime)  = date_sub(CAST(current_timestamp() as DATE), "
        + str(days_to_go_back) + ")")
    g1_1 = v1.count()
    g1 = g1 + g1_1
    v3 = {}
    v2 = v1.select("entitystring").rdd.map(lambda x: x.asDict().values(
    )[0].encode('utf-8')).map(lambda u: str(u).replace('null', 'None').replace(
        'false', 'False').replace('true', 'True')).map(
            lambda y: ast.literal_eval(ast.literal_eval(y)['entitydata'])).map(
                lambda u: get_nested_keys(u)).filter(
                    lambda x: len(x) > 0).collect()
    v3 = set([item.lower() for sublist in v2 for item in sublist])
    if len(v3) > 0:
        print "\nThe table " + str(k) + " seems to have nested data " + str(
            v3).replace("set([", "").replace("])", "") + "\n\n"
        nested = 'Nested data is observed'
Example #7
Json_location="/project/dsc/prod/archive/60263_fsm"

sqlContext.sql("use "+db)

sqlContext.sql("show tables").show()

# turn each "hadoop fs -ls" line into a "{'<path>':'<date>'}" dict literal ($8 = path column, $6 = modification date)
messages_2_days = commands.getoutput(
    "hadoop fs -ls  " + Json_location +
    " | awk -F' +' -v g=\"{\" -v h=\"}\" -v q=\"'\" -v s=\":\" '{print g q$8q s q$6q h}'"
).splitlines()[1:]
msg_2_days_dict={}
for i in messages_2_days:
    dict_=ast.literal_eval(i)
    if dict_.keys()[0]!='':
        msg_2_days_dict.update(dict_)
msg_2_days_dict


list_ids_in_db=sqlContext.table("audit").select("id").rdd.map(lambda l356: l356["id"]).collect()
#list_ids_in_db=[i424["id"] for i424 in list_ids_in_db_row if i424["id"] !='']
#list_ids_in_db

list_2_load_keys=list(set(msg_2_days_dict.keys()).difference(set(list_ids_in_db)))[0:2]
if len(list_2_load_keys)==0:
    print("No data to load")
    exit()
list_2_load_str="/* ".join(filter(None, list_2_load_keys))+"/*"
list_2_load_str
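# Note (not in the original snippet): list_2_load_str ends up as a space-separated
# glob such as "<dir1>/* <dir2>/*", presumably handed to a JSON reader for the
# directories that are not yet recorded in the audit table.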


ids_2_load=[[i425,msg_2_days_dict[i425]] for i425 in list_2_load_keys]

#ids_2_load=[i400.split(" ") for i400 in list_ids_2_load]
R = Row('id', 'date_')
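# A minimal sketch of how the pieces above might be combined; the createDataFrame
# call below is an assumption for illustration, not part of the original snippet.
audit_rows = [R(str(path), str(date_)) for path, date_ in ids_2_load]
audit_df = sqlContext.createDataFrame(audit_rows)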
Example #8
retweeted = retweeted.withColumn("ratio_tweet_retweeted", retweeted["retweeted"]/retweeted["tweets"])\
                    .orderBy("ratio_tweet_retweeted", ascending=False)

retweeted.limit(10).show()

# *************** VISUALIZATION WITH HIVE ************************

hiveContext = HiveContext(sc)

# In this case, we already have tables stored in Hive. We can list them with the command:

hiveContext.tables().show()

# Load the data from the tweets table

tweets = hiveContext.table("tweets28a")
print("\nLos datos cargados incluyen {} tweets\n".format(tweets.count()))

tweets.printSchema()

# Analyze the tweets that are geolocated

hiveContext.sql('DROP TABLE IF EXISTS tweets')
hiveContext.registerDataFrameAsTable(tweets, "tweets")
tweets_place = hiveContext.sql(
    "SELECT place.name, COUNT(text) AS tweets FROM tweets WHERE place IS NOT NULL GROUP BY place.name ORDER BY tweets DESC"
)
tweets_place.limit(10).show()

# We can do the same analysis using RDDs, as sketched below
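# A minimal sketch of that RDD-based variant (assumes the same `tweets` DataFrame
# loaded above; this block is illustrative and not part of the original snippet):
tweets_place_rdd = (tweets.rdd
                    .filter(lambda t: t.place is not None)
                    .map(lambda t: (t.place.name, 1))
                    .reduceByKey(lambda a, b: a + b)
                    .sortBy(lambda kv: kv[1], ascending=False))
for name, n_tweets in tweets_place_rdd.take(10):
    print("{}: {} tweets".format(name, n_tweets))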
Example #9
sqlContext.sql("use " + db)

#sqlContext.sql("show tables").show()

messages_2_days = commands.getoutput(
    "hadoop fs -ls  " + Json_location +
    " | awk -F' +' -v g=\"{\" -v h=\"}\" -v q=\"'\" -v s=\":\" '{print g q$8q s q$6q h}'"
).splitlines()[1:]
msg_2_days_dict = {}
for i in messages_2_days:
    dict_ = ast.literal_eval(i)
    if dict_.keys()[0] != '':
        msg_2_days_dict.update(dict_)
msg_2_days_dict

list_ids_in_db = sqlContext.table("latluri1.fsm_audit").select("id").rdd.map(
    lambda l356: l356["id"]).collect()
#list_ids_in_db=[i424["id"] for i424 in list_ids_in_db_row if i424["id"] !='']
#list_ids_in_db

list_2_load_keys = list(
    set(msg_2_days_dict.keys()).difference(set(list_ids_in_db)))[0:3]
if len(list_2_load_keys) == 0:
    print("No Process ids to load")
    exit()
list_2_load_str = "/* ".join(filter(None, list_2_load_keys)) + "/*"
list_2_load_keys_ids = [str(i429).split("/")[-1] for i429 in list_2_load_keys]

print("\n\nThe following processids are loaded " +
      str(list_2_load_keys_ids).replace("[", "").replace("]", ""))

ids_2_load = [[i425, msg_2_days_dict[i425]] for i425 in list_2_load_keys]
Example #10
import ast

ignore_list = [col.lower() for col in ignore_list_1]
g1 = 0
g2 = 0
sqlContext.sql("use dsc60230_gcct_tz_db")

tables_gcct_except_bad_rec=['account','activityparty','annotation','appointment','businessunit','calendar','calendarrule','contact','cxlvhlp_chatactivity','cxlvhlp_chatqueuestatistic','cxlvhlp_surveyitem','email','fax','gcct_accountresponsibleagent','gcct_additionalsymptomcodes','gcct_addportalmessage','gcct_arbitrationclaimprocessing','gcct_buybackevaluationmilestones','gcct_caseassignment','gcct_caseclassification','gcct_casedispositiontype','gcct_coachback','gcct_country','gcct_customersatisfactiontools','gcct_delegationofauthority','gcct_demandltrtpsmclaimsprocessing','gcct_doaprogramcode','gcct_documentcustomerrecontact','gcct_engine','gcct_executive','gcct_executiveliaison','gcct_fieldinvolvementassistancerequest','gcct_fmcccontract','gcct_fulfillmentvendor','gcct_genericinformation','gcct_genericinformationtopic','gcct_generictopic','gcct_goodfaithreview','gcct_knownpartsdelay','gcct_lemonlawcriteria','gcct_loanercost','gcct_loyaltyassistance','gcct_loyaltycostdetails','gcct_loyaltyprogramcode','gcct_material','gcct_materialrequest','gcct_materialrequestdetail','gcct_offer','gcct_offerdetail','gcct_onlinegaragevehicles','gcct_partsorderstatus','gcct_partsordersystems','gcct_partssmeassistancerequest','gcct_pcarequest','gcct_priordealerdecision','gcct_qaincident','gcct_qamonitoring','gcct_queuepriority','gcct_ravprocessingmilestones','gcct_reactdata','gcct_reacttransmission','gcct_reasoncode','gcct_recall','gcct_region','gcct_rentaldetail','gcct_slaconfiguration','gcct_sms','gcct_socialmediaconversation','gcct_socialmediamessage','gcct_socialmediaprofile','gcct_specialloanercode','gcct_state','gcct_surveyconfiguration','gcct_symptomclassification','gcct_tasktype','gcct_technicalassistancerequest','gcct_testdriverequest','gcct_timezone','gcct_userdelegationofauthority','gcct_vehicle','gcct_vehiclebrand','gcct_vehicleclass','gcct_vehicleloyaltyallowance','gcct_vehicleoffroad','gcct_vehicleowner','gcct_warrantyhistory','gcct_warrantyloanerpartsdelay','gcct_warrantyloanerrequest','gcct_warrantyloanerrequesthistory','globaloptionsetmetadata','incident','incidentresolution','letter','msdyn_answer','msdyn_question','msdyn_questionresponse','msdyn_survey','msdyn_surveyinvite','msdyn_surveyresponse','optionsetmetadata','phonecall','queue','queueitem','sla','slaitem','slakpiinstance','socialactivity','socialprofile','statemetadata','statusmetadata','systemuser','task','team','teammembership','territory']

nested = ''
duplicates = ''
data_discrepancy = ''
new_fields = ''
new_entities = ''
#for m in sqlContext.tables("dsc60230_gcct_tz_db").select('tableName').where("tableName not like '%bad%'").where("tableName not like '%dup%'").where("tableName not like '%bkp%'").where("tableName not like '%temptz%'").collect():
#k=m.asDict().values()[0].encode('utf-8')
for k in tables_gcct_except_bad_rec:
    v1 = sqlContext.table(k).filter(
        "to_date(lastupdatedatetime)  = date_sub(CAST(current_timestamp() as DATE), 1)")
    g1_1 = v1.count()
    g1 = g1 + g1_1
    v3 = {}
    # collect entitydata keys whose values hold nested/serialized structures
    v2 = (v1.select("entitystring").rdd
          .map(lambda x: x.asDict().values()[0].encode('utf-8'))
          .map(lambda u: str(u).replace('null', 'None').replace('false', 'False').replace('true', 'True'))
          .map(lambda y: ast.literal_eval(ast.literal_eval(y)['entitydata']))
          .map(lambda u: get_nested_keys(u))
          .filter(lambda x: len(x) > 0)
          .collect())
    v3 = set([item.lower() for sublist in v2 for item in sublist])
    if len(v3) > 0:
        print "\nThe table " + str(k) + " seems to have nested data " + str(
            v3).replace("set([", "").replace("])", "") + "\n\n"
        nested = 'Nested data is observed'
    # flag tables where the same shakey occurs more than once
    y1 = v1.groupby("shakey").count().filter("count >1")
    if y1.count() > 0:
        y2 = y1.select("shakey").toPandas().to_string(index=False).replace("\n", ";")
        print("\nTable: " + str(k) + " has " + str(y1.count()) +
              " duplicate(s)" + str(y2) + "\n\n")
        duplicates = 'Duplicate(s) are observed'
    # compare the keys present in entitydata against the table's columns to spot new fields
    col_list = set([col.lower() for col in v1.columns])
    v4 = (v1.select("entitystring").rdd
          .map(lambda x: x.asDict().values()[0].encode('utf-8'))
          .map(lambda u: str(u).replace('null', 'None').replace('false', 'False').replace('true', 'True'))
          .map(lambda y: ast.literal_eval(ast.literal_eval(y)['entitydata']).keys() + ast.literal_eval(y).keys())
          .map(lambda w: [x.lower() for x in w])
          .map(lambda x: set(x).difference(col_list))
          .collect())