Example #1
def get_query7():
    plt.clf()
    # os.remove("Query7.csv")
    # os.remove("graph7.png")
    qandroid = spark.sql(
        "select 'Android' as DeviceName, count(user.name) as TweetsCount from tweets where source LIKE '%and%'"
    )
    qiphone = spark.sql(
        "select 'iPhone' as DeviceName, count(user.name) as TweetsCount from tweets where source LIKE '%iphone%'"
    )
    qipad = spark.sql(
        "select 'iPad' as DeviceName, count(user.name) as TweetsCount from tweets where source LIKE '%ipad%'"
    )
    qwebpage = spark.sql(
        "select 'Web App' as DeviceName, count(user.name) as TweetsCount from tweets where source LIKE '%Web App%' "
    )
    query_7 = qandroid.union(qiphone).union(qipad).union(qwebpage)
    pdf = query_7.toPandas()  # avoid shadowing the pandas module alias `pd`
    pdf.to_csv("Query7.csv", index=False)
    plt.bar(x=pdf.DeviceName.tolist(), height=pdf.TweetsCount.tolist())
    plt.title("Devices Used for Tweets")
    plt.xlabel('Source')
    plt.ylabel('TweetsCount')
    # plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig("graph7.png")
    graph7 = BytesIO()
    plt.savefig(graph7)
    graph7.seek(0)
    return graph7
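Every get_query* example on this page renders a matplotlib figure into a BytesIO buffer and returns it, which is the usual shape for serving charts over HTTP. A minimal sketch of how such a buffer might be consumed, assuming Flask; the route name and app wiring are illustrative and not part of the original:

# Hypothetical Flask endpoint for the BytesIO pattern above (route/app are assumptions)
from flask import Flask, send_file

app = Flask(__name__)

@app.route("/graph7")
def graph7_endpoint():
    buf = get_query7()  # BytesIO positioned at offset 0
    return send_file(buf, mimetype="image/png")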
Example #2
def datasetTrain_ds(typologie_add_0):
    
    max_date = spark.sql(
        "select max(ref_chargement) from prd_tech.b_r_datasmart_ml_03_code_libelle_03_learn_ds"
    )
    ref_charg = max_date.collect()[0][0]
 
    dataset_code_lib_DS_ok0 = spark.sql("select * from prd_tech.b_r_datasmart_ml_03_code_libelle_03_learn_ds where ref_chargement='{}'".format(ref_charg))
    dataset_code_lib_DS_ok1 = dataset_code_lib_DS_ok0.toPandas()

    # keep the first occurrence of each (nom_champ1, nom_champ2) pair
    dataset_code_lib_DS_ok1['temp'] = dataset_code_lib_DS_ok1['nom_champ1'] + ',' + dataset_code_lib_DS_ok1['nom_champ2']
    A = np.unique(dataset_code_lib_DS_ok1['temp'], return_index=True)
    dataset_code_lib_DS_ok2 = dataset_code_lib_DS_ok1.loc[A[1], :].reset_index(drop=True)
    dataset_code_lib_DS_ok2.drop(['temp'], axis=1, inplace=True)
    # keep rows with a non-null statut, then attach the typology of each field
    dataset_code_lib_DS_ok3 = dataset_code_lib_DS_ok2[pd.isnull(dataset_code_lib_DS_ok2['statut']) == False]
    dataset_code_lib_DS_ok4 = dataset_code_lib_DS_ok3.merge(typologie_add_0.loc[:, ['nom_champ1', 'typologie1']], on=['nom_champ1'])
    dataset_code_lib_DS_ok5 = dataset_code_lib_DS_ok4.merge(typologie_add_0.loc[:, ['nom_champ2', 'typologie2']], on=['nom_champ2'])
    dataset_code_lib_DS_ok6 = dataset_code_lib_DS_ok5[dataset_code_lib_DS_ok5['typologie1'] == 'code/identifiant']
    dataset_code_lib_DS_ok7 = dataset_code_lib_DS_ok6[dataset_code_lib_DS_ok6['typologie2'] == 'libelle']
    dataset_code_lib_DS_ok7['source'] = 'manuelDS'

    # length of the longest common substring between the two field names
    dataset_code_lib_DS_ok7['temp'] = dataset_code_lib_DS_ok7['nom_champ1'] + ',' + dataset_code_lib_DS_ok7['nom_champ2']
    dataset_code_lib_DS_ok7['longuest'] = dataset_code_lib_DS_ok7['temp'].apply(longest_common_substring)
    dataset_code_lib_DS_ok7['len_longuest'] = dataset_code_lib_DS_ok7['longuest'].apply(len)
    dataset_code_lib_DS_ok7.drop(['temp', 'longuest'], axis=1, inplace=True)
    
    del dataset_code_lib_DS_ok0,dataset_code_lib_DS_ok1,dataset_code_lib_DS_ok2,dataset_code_lib_DS_ok3,dataset_code_lib_DS_ok4
    del dataset_code_lib_DS_ok5,dataset_code_lib_DS_ok6
    return dataset_code_lib_DS_ok7  
Example #3
def readTable(name):
    max_date = spark.sql("select max(ref_chargement) from " + name)
    ref_charg = max_date.collect()[0][0]

    table_add_0 = spark.sql("select * from " + name + " where ref_chargement='{}'".format(ref_charg))
    table_add_0 = table_add_0.toPandas()
    typologie_add_0 = spark.sql("select * from prd_tech.b_r_datasmart_ml_02_typologie_03_predict_primitive_all where ref_chargement='{}'".format(ref_charg))
    typologie_add_0 = typologie_add_0.toPandas()
    return table_add_0, typologie_add_0
Example #4
def get_query12():
    plt.clf()
    # os.remove("Query12.csv")
    # os.remove("graph12.png")
    query_12 = spark.sql(
        "select sum(geo_count) as geo_count,geo_enabled from (select count(*) as geo_count,geo_enabled from user where geo_enabled is not null group by geo_enabled union select count(*) as geo_count,user.geo_enabled as geo_enabled from user_retweeted where user.geo_enabled is not null group by user.geo_enabled) group by geo_enabled"
    )
    pdf = query_12.toPandas()
    pdf.to_csv('Query12.csv', index=False)
    pdf.plot.pie(y="geo_count",
                 labels=pdf.geo_enabled.tolist(),
                 autopct='%1.2f%%',
                 fontsize=10)
    my_circle = plt.Circle((0, 0), 0.5, color='white')
    p = plt.gcf()
    p.gca().add_artist(my_circle)
    plt.title("Geo Location Settings Enabled for Account")
    plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig("graph12.png")
    graph12 = BytesIO()
    plt.savefig(graph12)
    graph12.seek(0)
    return graph12
Example #5
def get_query9():
    plt.clf()
    # os.remove("Query9.csv")
    # os.remove("graph9.png")
    query_9 = spark.sql(
        "select sum(img_count) as img_count,profile_use_background_image as background_img from (select count(*) as img_count,profile_use_background_image from user where profile_use_background_image is not null group by profile_use_background_image union select count(*) as img_count,user.profile_use_background_image as profile_use_background_image from user_retweeted where user.profile_use_background_image is not null group by user.profile_use_background_image) group by profile_use_background_image"
    )
    pdf = query_9.toPandas()
    pdf.to_csv('Query9.csv', index=False)
    # pdf.set_index("background_img", drop=True, inplace=True)
    pdf.plot.pie(y="img_count",
                 labels=pdf.background_img.tolist(),
                 autopct='%1.2f%%',
                 fontsize=10)
    my_circle = plt.Circle((0, 0), 0.5, color='white')
    p = plt.gcf()
    p.gca().add_artist(my_circle)
    plt.title("User's Profile Background")
    plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig('graph9.png')
    graph9 = BytesIO()
    plt.savefig(graph9)
    graph9.seek(0)
    return graph9
Example #6
def json_dataset(spark, path):
    sc = spark.sparkContext
    # A JSON dataset is pointed to by path.

    #peopleDF = spark.read\
    #                .option("multiLine", True)\
    #                .option("mode", "PERMISSIVE")\
    #                .json(path)

    # spark.read.json over wholeTextFiles handles multi-line JSON records;
    # despite its name, dataRDD is a DataFrame
    dataRDD = spark.read\
                    .json(sc.wholeTextFiles(path)\
                    .values())

    print("RDD")
    dataRDD.show(10)

    # The inferred schema can be visualized using the printSchema() method
    dataRDD.printSchema()

    # Creates a temporary view using the DataFrame
    dataRDD.createOrReplaceTempView("table_data")

    # SQL statements can be run by using the sql methods provided by spark
    dataDF = spark.sql("SELECT * FROM table_data \
            where longitude not like '%not%relevant%' and latitude not like '%not%relevant%'"
                       )
    print('DF')
    #dataDF.show(10)
    return dataRDD, dataDF
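A possible way to call json_dataset, assuming a locally built SparkSession and an illustrative input path (neither appears in the original snippet):

# Illustrative usage of json_dataset; the app name and path are assumptions
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("json_dataset_demo").getOrCreate()
raw_df, filtered_df = json_dataset(spark, "data/tweets/*.json")
filtered_df.show(10)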
Example #7
def get_query11():
    plt.clf()
    # os.remove("Query11.csv")
    # os.remove("graph11.png")
    query_11 = spark.sql(
        "select usermention, count(*) as value from entity lateral view explode(entity.user_mentions.name) as usermention group by usermention order by value desc limit 20"
    )
    pdf = query_11.toPandas()
    pdf.to_csv('Query11.csv', index=False)
    matplotlib.rcParams['figure.figsize'] = 15, 5
    x = pdf.usermention.tolist()
    y = pdf.value.tolist()
    plt.plot(x, y)
    plt.title("Most Mentioned Twitter Accounts")
    plt.xlabel('Tweeted User')
    plt.ylabel('User mentioned count')
    plt.xticks(rotation=45, ha="right")
    # plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig('graph11.png')
    graph11 = BytesIO()
    plt.savefig(graph11)
    graph11.seek(0)
    return graph11
Example #8
def get_query5():
    plt.clf()
    # os.remove("graph5.png")

    def dateMTest(dateval):
        dt = datetime.datetime.strptime(dateval, '%a %b %d %H:%M:%S +0000 %Y')
        return dt

    d = udf(dateMTest, DateType())
    df1 = df.withColumn("created_date", d(df.created_at))
    df1.createOrReplaceTempView("Tweet_Sentiment")
    query_5 = spark.sql(
        "SELECT id,text,created_date  FROM Tweet_Sentiment WHERE 1=1 AND (upper(text) LIKE '%COVID%' OR text LIKE '%coronavirus%' or text like '%covid%' or upper(text) like '%CORONAVIRUS%')"
    )
    i = 0
    positive = 0
    neutral = 0
    negative = 0
    for t in query_5.select("text").collect():
        i = i + 1
        # print("It is ",i,str(t.text))
        analysis = TextBlob(str((t.text).encode('ascii', 'ignore')))
        # print(analysis.sentiment.polarity)
        if (analysis.sentiment.polarity < 0):
            negative = negative + 1
            # print(i, " in negative")
        elif (analysis.sentiment.polarity == 0.0):
            neutral = neutral + 1
            # print(i, " in neutral")
        elif (analysis.sentiment.polarity > 0):
            positive = positive + 1
            # print(i, " in positive")
    # print("Total negative % is", ((negative) * 100) / i)
    # print("Total neutral % is", ((neutral) * 100) / i)
    # print("Total positive % is", ((positive) * 100) / i)
    negative_percent = ((negative) * 100) / i
    positive_percent = ((positive) * 100) / i
    neutral_percent = ((neutral) * 100) / i
    size_of_groups = [negative_percent, positive_percent, neutral_percent]
    names = 'negative_percent', 'positive_percent', 'neutral_percent'
    plt.bar(x=names, height=size_of_groups, color=['red', 'green', 'blue'])
    plt.title("Sentiment Analysis on Tweet")
    # plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig('graph5.png')
    graph5 = BytesIO()
    plt.savefig(graph5)
    graph5.seek(0)
    return graph5
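TextBlob's polarity score is a float in [-1.0, 1.0], which is what the thresholds above bucket into negative, neutral and positive. A quick standalone check of that behavior:

# Sanity check of TextBlob polarity; the sample strings are illustrative
from textblob import TextBlob

for s in ["I love this vaccine news", "covid case numbers updated", "this lockdown is terrible"]:
    print(s, "->", TextBlob(s).sentiment.polarity)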
Example #9
def datasetTrain_add(name):
    max_date = spark.sql("select max(ref_chargement) from " + name)
    ref_charg = max_date.collect()[0][0]
    
    dataset_code_lib_add_ok0 = spark.sql("select * from prd_tech.b_r_datasmart_ml_03_code_libelle_03_learn_add where ref_chargement='{}'".format(ref_charg))
    dataset_code_lib_add_ok0 = dataset_code_lib_add_ok0.toPandas()

    dataset_code_lib_add_ok1 = dataset_code_lib_add_ok0[pd.isnull(dataset_code_lib_add_ok0['statut']) == False]
    dataset_code_lib_add_ok1['temp'] = dataset_code_lib_add_ok1['nom_champ1'] + ',' + dataset_code_lib_add_ok1['nom_champ2']
    dataset_code_lib_add_ok1.reset_index(drop=True, inplace=True)
    A = np.unique(dataset_code_lib_add_ok1['temp'], return_index=True)
    dataset_code_lib_add_ok2 = dataset_code_lib_add_ok1.loc[A[1], :].reset_index(drop=True)
    dataset_code_lib_add_ok2.drop(['temp'], axis=1, inplace=True)
    dataset_code_lib_add_ok2['source'] = 'manuelADD'

    dataset_code_lib_add_ok2['temp'] = dataset_code_lib_add_ok2['nom_champ1'] + ',' + dataset_code_lib_add_ok2['nom_champ2']
    dataset_code_lib_add_ok2['longuest'] = dataset_code_lib_add_ok2['temp'].apply(longest_common_substring)
    dataset_code_lib_add_ok2['len_longuest'] = dataset_code_lib_add_ok2['longuest'].apply(len)
    dataset_code_lib_add_ok2.drop(['temp', 'longuest'], axis=1, inplace=True)
    
    del dataset_code_lib_add_ok0, dataset_code_lib_add_ok1
    return dataset_code_lib_add_ok2
Example #10
def get_query13():
    plt.clf()
    # os.remove("Query13.csv")
    # os.remove("graph13.png")
    qcovid = spark.sql(
        "select 'covid' as keyword, count(*) as Count from tweets where text like '%covid%'"
    )
    qpandemic = spark.sql(
        "select 'pandemic' as keyword, count(*) as Count from tweets where text like '%pandemic%'"
    )
    qmask = spark.sql(
        "select 'masks' as keyword , count(*) as Count from tweets where text like '%masks%'"
    )
    qlock = spark.sql(
        "select 'lockdown' as keyword, count(*) as Count from tweets where text like '%lockdown%'"
    )
    qstayhome = spark.sql(
        "select 'stayhome' as keyword, count(*) as Count from tweets where text like '%stayhome%'"
    )
    query_13 = qcovid.union(qpandemic).union(qmask).union(qlock).union(
        qstayhome)
    pdf = query_13.toPandas()
    pdf.to_csv('Query13.csv', index=False)
    pdf.set_index("keyword", drop=True, inplace=True)
    pdf.plot.bar()
    plt.xticks(rotation=0, ha="right")
    plt.title("Inspect Based on Keywords")
    plt.ylabel('tweets count')
    # plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig("graph13.png")
    graph13 = BytesIO()
    plt.savefig(graph13)
    graph13.seek(0)
    return graph13
Example #11
def get_query17():
    plt.clf()
    # os.remove("Query17.csv")
    # os.remove("graph17.png")
    query_17 = spark.sql(
        "select(select count(*) from tweets)as tweets_count,(select count(*) from user)as Users_count"
    )
    pdf = query_17.toPandas()
    pdf.to_csv('Query17.csv', index=False)
    plt.bar(x=pdf.Users_count.tolist(), height=pdf.tweets_count.tolist())
    plt.xlabel('Users count')
    plt.ylabel('tweets count')
    plt.title("Users & Tweets Analyzed for Processing")
    # plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig('graph17.png')
    graph17 = BytesIO()
    plt.savefig(graph17)
    graph17.seek(0)
    return graph17
Example #12
def get_query4():
    plt.clf()
    # os.remove("Query4.csv")
    # os.remove("graph4.png")
    query4 = spark.sql(
        "SELECT LOWER(hashtags.text) As Hashtags, COUNT(*) AS total_count FROM tweets LATERAL VIEW EXPLODE(entities.hashtags) AS hashtags GROUP BY LOWER(hashtags.text) ORDER BY total_count DESC LIMIT 20"
    )
    pdf = query4.toPandas()
    pdf.to_csv('Query4.csv', index=False)
    plt.plot(pdf.Hashtags.tolist(), pdf.total_count.tolist())
    plt.xticks(rotation=45, ha="right")
    plt.title("Trending Hashtags")
    plt.xlabel('Hashtags')
    plt.ylabel('Count')
    # plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig("graph4.png")
    graph4 = BytesIO()
    plt.savefig(graph4)
    graph4.seek(0)
    return graph4
Example #13
def get_query1():
    plt.clf()
    # os.remove("Query1.csv")
    # os.remove("graph1.png")
    query_1 = spark.sql(
        "SELECT user.name,user.screen_name,user.id,geo.coordinates[0] As latt,geo.coordinates[1] As long from tweets where geo is not null"
    )
    pdf = query_1.toPandas()
    pdf.to_csv('Query1.csv', index=False)
    geometry = [Point(xy) for xy in zip(pdf['long'], pdf['latt'])]
    gdf = GeoDataFrame(pdf, geometry=geometry)
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    gdf.plot(ax=world.plot(), marker='o', color='red', markersize=15)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title("Geo Locations of Corona Tweets")
    plt.tight_layout()
    # plt.show()
    plt.savefig('graph1.png')
    graph1 = BytesIO()
    plt.savefig(graph1)
    graph1.seek(0)
    return graph1
Example #14
def get_query14():
    plt.clf()
    # os.remove("Query14.csv")
    # os.remove("graph14.png")
    query_14 = spark.sql(
        "select  place.country as country , count(*) as Count from tweets where place.country IS not null Group by place.country order by count DESC Limit 10"
    )
    pdf = query_14.toPandas()
    pdf.to_csv('Query14.csv', index=False)
    x = pdf.country.tolist()
    y = pdf.Count.tolist()
    plt.stackplot(x, y)
    plt.title("Twitter Most Used Countries")
    plt.xlabel('Country')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha="right")
    # plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig('graph14.png')
    graph14 = BytesIO()
    plt.savefig(graph14)
    graph14.seek(0)
    return graph14
Example #15
def get_query6():
    plt.clf()
    # os.remove("Query6.csv")
    # os.remove("graph6.png")
    query_6 = spark.sql(
        "select lang, count(*) as count from tweets where lang is not null group by lang order by count desc limit 10"
    )
    pdf = query_6.toPandas()
    pdf.to_csv('Query6.csv', index=False)
    pdf.plot.pie(y="count",
                 labels=pdf.lang.tolist(),
                 autopct='%1.2f%%',
                 shadow=False,
                 legend=False,
                 fontsize=8)
    plt.title("Top Languages Used in Profile")
    plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig("graph6.png")
    graph6 = BytesIO()
    plt.savefig(graph6)
    graph6.seek(0)
    return graph6
Example #16
def get_query3():
    plt.clf()
    # os.remove("Query3.csv")
    # os.remove("graph3.png")
    query_3 = spark.sql(
        "select  t.Name,t.Retweeted_Count from(select user.screen_name as Name,SUM(retweet_count) as Retweeted_Count, count(*) from user_retweeted where user.screen_name is not null group by Name ) t order by Retweeted_Count DESC LIMIT 20"
    )
    pdf = query_3.toPandas()
    pdf.to_csv('Query3.csv', index=False)
    x = pdf.Name.tolist()
    y = pdf.Retweeted_Count.tolist()
    plt.stackplot(x, y)
    plt.title("Most Influential Person Based on Retweet")
    plt.xlabel('User Name')
    plt.ylabel('Retweeted count')
    plt.xticks(rotation=45, ha="right")
    # plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig('graph3.png')
    graph3 = BytesIO()
    plt.savefig(graph3)
    graph3.seek(0)
    return graph3
Example #17
def get_query2():
    plt.clf()
    # os.remove("Query2.csv")
    # os.remove("graph2.png")
    query_2 = spark.sql(
        "select name, followers_count from user where name is not null and verified=='true' order by followers_count DESC LIMIT 20"
    )
    pdf = query_2.toPandas()
    pdf.to_csv('Query2.csv', index=False)
    pdf.plot.pie(y="followers_count",
                 labels=pdf.name.tolist(),
                 autopct='%1.2f%%',
                 shadow=False,
                 legend=False,
                 fontsize=8)
    plt.title("Followers Count of verified Users")
    plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig('graph2.png')
    graph2 = BytesIO()
    plt.savefig(graph2)
    graph2.seek(0)
    return graph2
Example #18
def get_query16():
    plt.clf()
    # os.remove("Query16.csv")
    # os.remove("graph16.png")
    query_16 = spark.sql(
        "select text, count(*) as no_of_tweets from tweets where text is not null and retweeted_status.id is not null and lang='en' group by text order by no_of_tweets desc limit 10"
    )
    pdf = query_16.toPandas()
    pdf.to_csv('Query16.csv', index=False)
    pdf.plot.pie(y="no_of_tweets",
                 labels=pdf.text.tolist(),
                 autopct='%1.2f%%',
                 shadow=False,
                 legend=False,
                 fontsize=8)
    plt.title("Top Retweeted Tweets (English-language tweets)")
    plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig('graph16.png')
    graph16 = BytesIO()
    plt.savefig(graph16)
    graph16.seek(0)
    return graph16
Example #19
def get_query15():
    plt.clf()
    # os.remove("Query15.csv")
    # os.remove("graph15.png")
    query_15 = spark.sql(
        "select screen_name as Name, location as Location, followers_count as FollowersCount  from user  where location is not null ORDER BY followers_count DESC LIMIT 10"
    )
    pdf = query_15.toPandas()
    pdf.to_csv("Query15.csv", index=False)
    pdf.plot.pie(y="FollowersCount",
                 labels=pdf.Name.tolist(),
                 autopct='%1.2f%%',
                 shadow=False,
                 legend=False,
                 fontsize=8)
    plt.title("Accounts With the Most Followers by Location")
    plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig("graph15.png")
    graph15 = BytesIO()
    plt.savefig(graph15)
    graph15.seek(0)
    return graph15
Example #20
def get_query8():
    plt.clf()
    # os.remove("Query8.csv")
    # os.remove("graph8.png")
    query_8 = spark.sql(
        "select count(*) as TweetCount,SUBSTR(user.created_at,0,10) as TWEETDATE from tweets where created_at IS not null GROUP BY user.created_at ORDER BY TweetCount DESC LIMIT 30"
    )
    pdf = query_8.toPandas()
    pdf.to_csv('Query8.csv', index=False)
    y = pdf.TweetCount.tolist()
    x = pdf.TWEETDATE.tolist()
    plt.plot(x, y, alpha=0.5)
    plt.title("Stats of Tweets Per Day")
    plt.xlabel('Tweet Date')
    plt.ylabel('Tweet Count')
    plt.xticks(rotation=45, ha="right")
    # plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig('graph8.png')
    graph8 = BytesIO()
    plt.savefig(graph8)
    graph8.seek(0)
    return graph8
Example #21
def get_query10():
    plt.clf()
    # os.remove("Query10.csv")
    # os.remove("graph10.png")
    query_10 = spark.sql(
        "select name,friends_count as friends from user order by friends desc limit 20"
    )
    pdf = query_10.toPandas()
    pdf.to_csv('Query10.csv', index=False)
    x = pdf.name.tolist()
    y = pdf.friends.tolist()
    plt.plot(x, y, alpha=0.5)
    plt.title("Trends on Number of friends")
    plt.xlabel('User Name')
    plt.ylabel('Friends Count')
    plt.xticks(rotation=45, ha="right")
    # plt.axis('equal')
    plt.tight_layout()
    # plt.show()
    plt.savefig("graph10.png")
    graph10 = BytesIO()
    plt.savefig(graph10)
    graph10.seek(0)
    return graph10
Example #22
        logger.info(
            'New Daily Clustering Bikes by location Model has been trained on the {}'
            .format(datetime.now()))

    if mode == "predicting":
        model = KMeansModel.load(modelpath)
        logger.info(
            'Daily Clustering Bikes by location Started on the {} '.format(
                datetime.now()))

        transformed_tot = model.transform(dataDF).withColumnRenamed(
            "prediction", "cluster_id")

        #creating output dataset
        transformed_tot.createOrReplaceTempView("data_table")
        data_table = spark.sql(
            "SELECT data_table.id, data_table.name, data_table.address, data_table.latitude, data_table.longitude, data_table.position, data_table.coordinates, data_table.cluster_id FROM data_table"
        )
        data_table.coalesce(1).write.format('json').save(
            datapath_output_file1 + currentdate + '.json')
        logger.info('Daily Output file {} generated on {}'.format(
            datapath_output_file1 + currentdate + '.json', datetime.now()))
        transformed_tot.cache()

        # creating the centroid grouping and the output file for this grouping
        centerList = list()
        centers1 = model.clusterCenters()
        count = 0
        for center in centers1:
            temp_list = center.tolist()
            temp_list.append(count)  # tag each centroid with its cluster id
            centerList.append(temp_list)
            count += 1
Example #23
import pandas as pd

## constants
observ_daterange = '1910_1912'
label_daterange = '2001'

## HIVE SQL
cj_1910_1912 = spark.sql("""
    SELECT distinct actor_id, event, yyyymm AS behavior_yyyymm, action_sponsor, action_time, action_type, object_id, 
    twd_amt, txn_currency_code, txn_type_desc, atm_location, target_acct_nbr, target_bank_code
    FROM vp_bank.customer_journey_event a 
    lateral view json_tuple(a.attrs, "action") attr_a as attrs_action
    lateral view json_tuple(a.attrs, "channel") attr_c as attrs_channel
    lateral view json_tuple(a.attrs, "object") attr_o as attrs_object
    lateral view json_tuple(attr_a.attrs_action, "twd_amt") n3 as twd_amt
    lateral view json_tuple(attr_a.attrs_action, "txn_currency_code") n4 as txn_currency_code
    lateral view json_tuple(attr_a.attrs_action, "txn_type_desc") n5 as txn_type_desc
    lateral view json_tuple(attr_c.attrs_channel, "atm_location") n4 as atm_location
    lateral view json_tuple(attr_o.attrs_object, "target_acct_nbr") n4 as target_acct_nbr
    lateral view json_tuple(attr_o.attrs_object, "target_bank_code") n4 as target_bank_code
    where event in ("atm_transaction", "myatm_transaction", "mybank_transaction") 
        and yyyymm between "201910" and "201912"
    order by actor_id, action_time asc    
""")

accounts_2001 = spark.sql("""
    SELECT distinct customer_id, acct_nbr, customer_class_code, yyyymm, start_ym, label
    FROM btmp_cmd.nt83716_cip_acct_base_ds
    WHERE yyyymm = "202001"
""")
Example #24
from pyspark.sql import Row, SparkSession

# d1, d2 and sc are assumed to be RDDs and a SparkContext defined earlier in the script
nd11 = d1.map(lambda x: (x[0].split("_")[0], x[1], x[2], x[3]))

nd1 = nd11.map(lambda x: (x[0:2], x[2:]))

nd2 = d2.map(lambda x: (x[0:2], x[2:]))

join = nd1.join(nd2)

spark = SparkSession(sc)

df = join.map(lambda kv: Row(kv[0], kv[1][0], kv[1][1])).toDF()

df.createTempView("temp")

temp = spark.sql(
    "select _1._1 as Name, _1._2 as Degree, _2._1 as Country, _2._2 as State, _3._1 as Hotel_1, _3._2 as Hotel_2, "
    "cast (_3._3 as int) as Salary_1, cast (_3._4 as int) as Salary_2 from temp "
)

temp.createTempView("resume")

res = spark.sql("select * from resume")

res.show()

res1 = spark.sql(
    "select Name, ((Salary_1+Salary_2)/2) as Average_Salary from resume where Degree like 'Bachelor%'"
)

res1.show()

# res1.write.option("header","true").csv("/home/lucky/Downloads/Output/Result1.csv")
Example #25
import json
import time as today
# `spark` is assumed to be an existing SparkSession


def convertToJson(json_list):
    load_date = today.strftime("%Y-%m-%d")
    client = 'zeus'

    output = load_date + '|' + client + '|' + str(json.dumps(json_list))
    return output


jsonDF = spark.sql("""select * from json_ready_tbl""")
#jsonRDD = jsonDF.rdd.map(lambda x: convertToJson(x)).saveAsTextFile('/hdfs/output/dir')

# n is the batch size
n = 1000
i = 0
json_list = []
batch_lines = []
# a DataFrame is not iterable; stream its rows with toLocalIterator()
for jdf in jsonDF.toLocalIterator():
    # Row is a tuple subclass, so json.dumps renders it as a JSON array
    json_list.append(jdf)
    i += 1
    if i > n - 1:
        batch_lines.append(convertToJson(json_list))
        i = 0
        json_list = []
if json_list:
    batch_lines.append(convertToJson(json_list))
# write all batches once, one output line per batch
spark.sparkContext.parallelize(batch_lines).saveAsTextFile('/hdfs/output/dir')
Example #26
# `spark` is assumed to be an existing SparkSession

table_list = ['table_1', 'table_2']
for t in table_list:
    df = spark.sql("""desc formatted schema_name.{table_name}""".format(
        table_name=t))
    print(df.filter("col_name=='Location'").collect()[0].data_type)

# for one table

# spark.sql("""desc formatted schema_name.table_name""").filter("col_name=='Location'").collect()[0].data_type
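If the locations are needed downstream rather than just printed, the same loop can collect them into a dict (a small illustrative extension, not in the original):

# Collect each table's storage location instead of printing it
locations = {}
for t in table_list:
    desc = spark.sql("desc formatted schema_name.{table_name}".format(table_name=t))
    locations[t] = desc.filter("col_name=='Location'").collect()[0].data_type
print(locations)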