Example #1
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession, SQLContext

def run_spark():
    spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()
    df = spark.read.format("csv").option("header", "true").load("hdfs:///project/samples")
    sqlContext=SQLContext(spark)
    sqlContext.registerDataFrameAsTable(df, "table1")
    sentences=sqlContext.sql("""SELECT `reviews.rating`,`reviews.text` FROM table1""").rdd
    sentences.collect()
    alist=sentences.map(lambda x:x[0] if x[0] is not None else 0).collect()
    # get_score() is a sentiment-scoring helper defined elsewhere in the original project
    blist=sentences.map(lambda x:get_score(x[1]) if x[1] is not None else 0).collect()
    return alist,blist

def draw_plots(alist, blist):
    t_plt, = plt.plot(np.arange(1, len(alist)+1), alist, 'r')
    v_plt, = plt.plot(np.arange(1, len(alist)+1), blist)
    plt.title('NLP Emotion Analysis')
    plt.xlabel('epoch')
    plt.ylabel('score')
    plt.legend((t_plt, v_plt), ('rating', 'score'))
    plt.savefig("result.png")
    with open("x1.txt","w+") as input_:
        input_.write(','.join([str(x) for x in alist]))
        input_.write(','.join([str(x) for x in blist])) 

if __name__ == '__main__':
    alist, blist = run_spark()
    draw_plots(alist, blist)
Example #2
def ALS_fit():
    usern = request.args.get('usern')
    users_df = pd.read_sql_query(
        '''SELECT DISTINCT mt3ratings.user, user_id FROM mt3ratings WHERE appdata = 1''',
        engine)
    if usern not in users_df['user'].values:
        return_str = "can't find user"
        return jsonify(result=return_str)
    user_id = users_df.user_id[users_df.user == usern].values[0]
    try:
        key = request.args.get('key')
    except NameError:
        key = 'e'
    if key == 'abcd':
        #start spark

        try:
            conf = SparkConf().setAppName("BeerSleuthALS").set(
                "spark.executor.memory", "4g")
            sc = SparkContext(conf=conf)
        except ValueError:
            pass
        sqlContext = SQLContext(sc)
        ratings_sqldf = modeling.get_item_user_rev_from_pg(engine, sqlContext)
        sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")
        print('fitting model')
        model = modeling.fit_final_model(ratings_sqldf)
        beer_ids = beer_dict.values()
        to_predict = zip([user_id] * len(beer_ids), beer_ids)
        to_predict_top20 = zip([user_id] * len(beer_id_filt), beer_id_filt)
        user_preds = model.predictAll(sc.parallelize(to_predict)).collect()
        user_preds_top20 = model.predictAll(
            sc.parallelize(to_predict_top20)).collect()
        print('got preds')
        preds = Counter({x[1]: x[2] for x in user_preds})
        preds_top20 = Counter({x[1]: x[2] for x in user_preds_top20})
        with open('%s%s_preds.pkl' % (pred_path, user_id), 'wb') as f:
            pickle.dump(preds, f)
        with open('%s%s_preds_top20.pkl' % (pred_path, user_id), 'wb') as f:
            pickle.dump(preds_top20, f)

        print('done')
        sc.stop()
        return jsonify(
            result="Model training complete, you may now get predictions")
Example #3
def ALS_fit():
    usern = request.args.get('usern')
    users_df = pd.read_sql_query('''SELECT DISTINCT mt3ratings.user, user_id FROM mt3ratings WHERE appdata = 1''', engine)
    if usern not in users_df['user'].values:
        return_str =  "can't find user"
        return jsonify(result = return_str)
    user_id = users_df.user_id[users_df.user == usern].values[0]
    try: key = request.args.get('key')
    except NameError: key = 'e'
    if key == 'abcd':
            #start spark

        try:
             conf = SparkConf().setAppName("BeerSleuthALS").set("spark.executor.memory", "4g")
             sc = SparkContext(conf=conf)
        except ValueError: pass
        sqlContext = SQLContext(sc)
        ratings_sqldf = modeling.get_item_user_rev_from_pg(engine, sqlContext)
        sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")
        print('fitting model')
        model = modeling.fit_final_model(ratings_sqldf)
        beer_ids = beer_dict.values()
        to_predict = zip([user_id]*len(beer_ids), beer_ids)
        to_predict_top20 = zip([user_id]*len(beer_id_filt), beer_id_filt)
        user_preds = model.predictAll(sc.parallelize(to_predict)).collect()
        user_preds_top20 = model.predictAll(sc.parallelize(to_predict_top20)).collect()
        print('got preds')
        preds = Counter({x[1]: x[2] for x in user_preds})
        preds_top20 = Counter({x[1]: x[2] for x in user_preds_top20})
        with open('%s%s_preds.pkl'%(pred_path, user_id),'wb') as f:
            pickle.dump(preds, f)
        with open('%s%s_preds_top20.pkl'%(pred_path, user_id),'wb') as f:
            pickle.dump(preds_top20, f)

        print('done')
        sc.stop()
        return jsonify(result="Model training complete, you may now get predictions")
Example #4
    #input = "/impala/parquet/back/back-portal-loginflowlog/dat=%s*" % ym
    input = '/input/loginfowlog/*'

    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home
    conf = (SparkConf()
            .setMaster(master)
            .setAppName(appName)
            .set("spark.sql.parquet.binaryAsString","true")
            )
    sc = SparkContext(conf = conf)
    sql_context = SQLContext(sc)
    sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())

    parquet_df = sql_context.read.parquet(input)
    sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")
    #_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
    _sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))"
    rs_df = sql_context.sql(_sql)
    rs = rs_df.collect()
    logger.info("---->" + str(len(rs)))

    lists = []
    for r in rs:
        usermac = r[0]
        days = r[1]
        t = (usermac,days)
        lists.append(t)
        #logger.debug(t)

    dao = MysqlDao()
Example #5
from pyspark import SparkConf, SparkContext, SQLContext

conf = SparkConf().setMaster('local').setAppName('py03a')
sc = SparkContext(conf=conf)
sqc = SQLContext(sc)

#and show that you could exit your pyspark shell and come back in it
df = sqc.read.parquet('auction_parquet')
df.show(5)

sqc.registerDataFrameAsTable(df, 'auction_p')

print('Bid history for bidder pagep123')
sqc.sql('''
        SELECT * FROM auction_p
        WHERE bidder = 'pagep123'
        ORDER BY auctionid, bid
          ''').show()
Example #6

if __name__ == '__main__':
    # set up environment
    conf = SparkConf() \
      .setAppName("BeerSleuthALS") \
      .set("spark.driver.memory", "8g")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    #load data
    engine = create_engine(
        'postgresql://*****:*****@localhost:5432/beersleuth')
    ratings_sqldf = get_item_user_rev_from_pg(engine, sqlContext)
    beer_sqldf = get_beer_data(engine)
    sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")
#    train, test = sqlContext.table('ratings').randomSplit([.8, .2])
#    train = train.cache()
#    test = test.cache()
##    add_rating_to_db(user='******', beer=u'101 North Heroine IPA' , taste=8, engine=engine)
##    add_rating_to_db(user='******', beer=u'Boulder Creek Golden Promise' , taste=6, engine=engine)
##    model_param_sweep(train, test)
#    import timeit
#    start_time = timeit.default_timer()
#    model = fit_final_model(ratings_sqldf)
#    elapsed = timeit.default_timer() - start_time
'''
sim_dict={}
for i in beer_data.index:
     sim_dict[i] = Counter()
     for j in beer_data.index:
Example #7
def Homepage():
    """Renders a sample page."""
    style.use('ggplot')
    a = 5
    '''
    Spark details:
    the data was cleaned using Spark and the pandas library;
    the CSV was converted to a DataFrame using the SQL context and then
    registered as a table, which was accessed using SQL
    '''
    sc = SparkContext(appName="DemoCount")
    sqlct = SQLContext(sc)
    #pandas_df = pd.read_csv('C:/Users/madhumita/Downloads/Test_final.csv')
    pandas_df = pd.read_csv('Demo_data.csv')
    s_df1 = sqlct.createDataFrame(pandas_df)
    sqlct.registerDataFrameAsTable(s_df1, "Demo_2")
    df0_3 = sqlct.sql("Select * from Demo_2")
    df_0_3 = df0_3.groupby('CTYNAME').sum()

    df0_3 = sqlct.sql(
        "Select CTYNAME,TOT_POP,TOT_MALE,TOT_FEMALE from Demo_2 where YEAR=8 and AGEGRP >0 and AGEGRP <=3"
    )
    df_0_3 = df0_3.groupby('CTYNAME').sum()
    a0_3 = df_0_3.select('CTYNAME', 'sum(TOT_POP)', 'sum(TOT_MALE)',
                         'sum(TOT_FEMALE)').collect()

    for row in a0_3:

        CityName = str(row[0])
        Tot_pop = str(row[1])
        Tot_Male = str(row[2])
        Tot_Female = str(row[3])
        AGEGRP = '0:3'
        print(CityName + "-TP-" + Tot_pop + "-TM-" + Tot_Male + "-TF-" +
              Tot_Female)
        SQLCommand = (
            "SELECT * FROM loc_tet where CTYNAME= ? and AGEGRP='0:3'")
        Values = [CityName]
        cursor.execute(SQLCommand, Values)
        if cursor.rowcount == 0:
            SQLCommand = ("INSERT INTO loc_tet "
                          "(CTYNAME, AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE) "
                          "VALUES (?,?,?,?,?)")
            Values = [CityName, AGEGRP, Tot_pop, Tot_Male, Tot_Female]
            cursor.execute(SQLCommand, Values)
            connection.commit()
        else:

            cursor.execute(
                """Update loc_tet Set TOT_POP=(?) ,TOT_MALE=(?),TOT_FEMALE=(?) where CTYNAME=(?) and  AGEGRP='0:3'""",
                (Tot_pop, Tot_Male, Tot_Female, CityName))
            connection.commit()

    df4_9 = sqlct.sql(
        "Select CTYNAME,TOT_POP,TOT_MALE,TOT_FEMALE from Demo_2 where YEAR=8 and AGEGRP >4 and AGEGRP <=9"
    )
    df_4_9 = df4_9.groupby('CTYNAME').sum()

    b4_9 = df_4_9.select('CTYNAME', 'sum(TOT_POP)', 'sum(TOT_MALE)',
                         'sum(TOT_FEMALE)').collect()

    for row in b4_9:

        CityName = str(row[0])
        Tot_pop = str(row[1])
        Tot_Male = str(row[2])
        Tot_Female = str(row[3])
        AGEGRP = '4:9'
        print(CityName + "-TP-" + Tot_pop + "-TM-" + Tot_Male + "-TF-" +
              Tot_Female)
        SQLCommand = (
            "SELECT * FROM loc_tet where CTYNAME= ? and AGEGRP='4:9'")
        Values = [CityName]
        cursor.execute(SQLCommand, Values)
        if cursor.rowcount == 0:
            SQLCommand = ("INSERT INTO loc_tet "
                          "(CTYNAME, AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE) "
                          "VALUES (?,?,?,?,?)")
            Values = [CityName, AGEGRP, Tot_pop, Tot_Male, Tot_Female]
            cursor.execute(SQLCommand, Values)
            connection.commit()
        else:

            cursor.execute(
                """Update loc_tet Set TOT_POP=(?) ,TOT_MALE=(?),TOT_FEMALE=(?) where CTYNAME=(?) and  AGEGRP='4:9'""",
                (Tot_pop, Tot_Male, Tot_Female, CityName))
            connection.commit()

    df10_13 = sqlct.sql(
        "Select CTYNAME,TOT_POP,TOT_MALE,TOT_FEMALE from Demo_2 where YEAR=8 and AGEGRP >9 and AGEGRP <=13"
    )
    df_10_13 = df10_13.groupby('CTYNAME').sum()
    c10_13 = df_10_13.select('CTYNAME', 'sum(TOT_POP)', 'sum(TOT_MALE)',
                             'sum(TOT_FEMALE)').collect()

    for row in c10_13:

        CityName = str(row[0])
        Tot_pop = str(row[1])
        Tot_Male = str(row[2])
        Tot_Female = str(row[3])
        AGEGRP = '10:13'
        print(CityName + "-TP-" + Tot_pop + "-TM-" + Tot_Male + "-TF-" +
              Tot_Female)
        SQLCommand = (
            "SELECT * FROM loc_tet where CTYNAME= ? and AGEGRP='10:13'")
        Values = [CityName]
        cursor.execute(SQLCommand, Values)
        if cursor.rowcount == 0:
            SQLCommand = ("INSERT INTO loc_tet "
                          "(CTYNAME, AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE) "
                          "VALUES (?,?,?,?,?)")
            Values = [CityName, AGEGRP, Tot_pop, Tot_Male, Tot_Female]
            cursor.execute(SQLCommand, Values)
            connection.commit()
        else:

            cursor.execute(
                """Update loc_tet Set TOT_POP=(?) ,TOT_MALE=(?),TOT_FEMALE=(?) where CTYNAME=(?) and  AGEGRP='10:13'""",
                (Tot_pop, Tot_Male, Tot_Female, CityName))
            connection.commit()

    df13 = sqlct.sql(
        "Select CTYNAME,TOT_POP,TOT_MALE,TOT_FEMALE from Demo_2 where YEAR=8 and AGEGRP >13 "
    )
    df_13 = df13.groupby('CTYNAME').sum()
    d_13 = df_13.select('CTYNAME', 'sum(TOT_POP)', 'sum(TOT_MALE)',
                        'sum(TOT_FEMALE)').collect()
    for row in d_13:

        CityName = str(row[0])
        Tot_pop = str(row[1])
        Tot_Male = str(row[2])
        Tot_Female = str(row[3])
        AGEGRP = '14:18'
        print(CityName + "-TP-" + Tot_pop + "-TM-" + Tot_Male + "-TF-" +
              Tot_Female)
        SQLCommand = (
            "SELECT * FROM loc_tet where CTYNAME= ? and AGEGRP='14:18'")
        Values = [CityName]
        cursor.execute(SQLCommand, Values)
        if cursor.rowcount == 0:
            SQLCommand = ("INSERT INTO loc_tet "
                          "(CTYNAME, AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE) "
                          "VALUES (?,?,?,?,?)")
            Values = [CityName, AGEGRP, Tot_pop, Tot_Male, Tot_Female]
            cursor.execute(SQLCommand, Values)
            connection.commit()
        else:

            cursor.execute(
                """Update loc_tet Set TOT_POP=(?) ,TOT_MALE=(?),TOT_FEMALE=(?) where CTYNAME=(?) and  AGEGRP='14:18'""",
                (Tot_pop, Tot_Male, Tot_Female, CityName))
            connection.commit()

    return render_template("main.html",
                           Data0_3=a0_3,
                           Data4_9=b4_9,
                           Data10_13=c10_13,
                           Data13=d_13)
Example #8

df=sqlContext.createDataFrame(l, ['name', 'age'])
# print(df)

d = [{'name': 'paul', 'age': 10,'gender':'male'},{'name': 'alice', 'age': 30,'gender':None}]
print(sqlContext.createDataFrame(d).collect())
# 
rdd = sc.parallelize(l)
df = sqlContext.createDataFrame(rdd, ['name', 'age'])
# print(df.collect())
#df=sqlContext.createDataFrame(rdd)
# print(df.printSchema())
print(df.head(2))

sqlContext.registerDataFrameAsTable(df, "table1")
df2 = sqlContext.sql("SELECT name,age from table1 where name='bob'")
print(df2.collect())
print(sqlContext.tableNames())
sqlContext.dropTempTable("table1")
print(sqlContext.tableNames())


df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load("/home/harsh/mapping_minds_training/spark/train_u6lujuX_CVtuZ9i.csv")
print(df.groupBy('Gender').agg({'ApplicantIncome': 'mean'}).show())
print(df.head(3))
print(df.printSchema())
print(df.columns)
df.cache()
print('count-------------------------->',df.count())
Example #9
tweets_sample = sqlContext.read.json(path_to_data)

print("El dataset cargado contiene %d tweets" % tweets_sample.count())

# Estudiamos el schema de los datos importados

print("\nShcema de los datos cargados:\n")
tweets_sample.printSchema()

print("\nVisualización de los datos:\n")
tweets_sample.show()

# *************** VISUALIZACIÓN CON SQL ************************

sqlContext.sql('DROP TABLE IF EXISTS tweets_sample')
sqlContext.registerDataFrameAsTable(tweets_sample, "tweets_sample")

# Show the users with the most tweets, including additional information

users_agg = sqlContext.sql(
    "SELECT user.screen_name, MAX(user.friends_count) AS friends_count, MAX(user.followers_count) AS followers_count, user.lang, COUNT(text) AS tweets FROM tweets_sample WHERE user.lang = 'es' GROUP BY user.screen_name, user.lang ORDER BY tweets DESC"
)
users_agg.show()

# Load the aggregated result into a table

sqlContext.sql('DROP TABLE IF EXISTS user_agg')
sqlContext.registerDataFrameAsTable(users_agg, "user_agg")

# Examine the users who have received the most retweets, showing additional information and computing the ratio of retweets to tweets
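
The excerpt is cut off at this point. A minimal sketch of such a query follows (the retweet_count field is assumed from the standard Twitter JSON schema; it does not appear in the original excerpt):

retweets_agg = sqlContext.sql(
    "SELECT user.screen_name, SUM(retweet_count) AS retweets, COUNT(text) AS tweets, "
    "SUM(retweet_count) / COUNT(text) AS retweets_per_tweet "
    "FROM tweets_sample GROUP BY user.screen_name ORDER BY retweets DESC"
)
retweets_agg.show()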
Example #10
    day = pro_time.strftime("%Y%m%d")

    master = "spark://hadoop:7077"
    appName = "spark_pageflow_outflow"
    input = "/impala/parquet/site/site-pageflowv1/dat=%s" % day

    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home

    sc = SparkContext(master, appName)
    sql_context = SQLContext(sc)
    sql_context.registerFunction("to_day", lambda x: mill_date_str(x), StringType())
    sql_context.registerFunction("to_str", lambda x: bytearray_str(x), StringType())

    parquet_df = sql_context.read.parquet(input)
    sql_context.registerDataFrameAsTable(parquet_df, "site_pageflowv1")

    _sql = "select to_str(url),to_day(createtime) day,count(1) pv,count(distinct to_str(guuid)) uv " \
           "from site_pageflowv1 where dat= %s and to_str(name)='outflow' " \
           "group by to_str(url),to_day(createtime)" % day

    rs_df = sql_context.sql(_sql)
    rs = rs_df.collect()
    logger.info("---->" + str(len(rs)))

    list = []
    for r in rs:
        url = r[0]
        day = r[1]
        pv = r[2]
        uv = r[3]
Example #11

def oritentData(record):
  # the head of this snippet is truncated; `indexes` is defined earlier in the original source
  return [ record[i].replace('"','') for i in indexes]

def filterData(record):
  flag = True
  if (int(record[-4])<1) or (record[-2] not in (['1','4'])) or (record[-1] != ''): flag = False
  return flag

if __name__ == '__main__':
  sc = SparkContext(appName = 'CF_prod_in_transaction')
  sqlContext = SQLContext(sc)
  in_file = sc.textFile(sys.argv[1])
  data = in_file.map(oritentData).filter(filterData).map(lambda x: [int(i) for i in x[:-3]])
  Record = Row('customer_id','product_id','invoice_id','units')
  data = data.map(lambda x: Record(*x))
  data = sqlContext.createDataFrame(data)
  sqlContext.registerDataFrameAsTable(data,'table1')
  df = sqlContext.sql('select customer_id, product_id, sum(units) as prod_in_transactions from table1 group by customer_id, product_id')
  df.map(lambda x: ','.join([str(r) for r in x])).saveAsTextFile(sys.argv[2])
  sc.stop()



data_path,header,train_sample,number,support,confidence,lift,k,testing,testing_split,seed,output_path
import csv

write = open('test.csv','w')
wrtr = csv.writer(write)

read = open('arqiva.csv')
# note: iterating a file yields strings, and writerow() on a string writes one character per column
for line in read: wrtr.writerow(line)

from e
Example #12
     sim_df = sim_df.drop_duplicates()
     return sim_df

if __name__ == '__main__':
    # set up environment
    conf = SparkConf() \
      .setAppName("BeerSleuthALS") \
      .set("spark.driver.memory", "8g")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    #load data
    engine = create_engine('postgresql://*****:*****@localhost:5432/beersleuth')
    ratings_sqldf = get_item_user_rev_from_pg(engine, sqlContext)
    beer_sqldf = get_beer_data(engine)
    sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")
#    train, test = sqlContext.table('ratings').randomSplit([.8, .2])
#    train = train.cache()
#    test = test.cache()
##    add_rating_to_db(user='******', beer=u'101 North Heroine IPA' , taste=8, engine=engine)
##    add_rating_to_db(user='******', beer=u'Boulder Creek Golden Promise' , taste=6, engine=engine)
##    model_param_sweep(train, test)
#    import timeit
#    start_time = timeit.default_timer()
#    model = fit_final_model(ratings_sqldf)
#    elapsed = timeit.default_timer() - start_time




Example #13
class Analysiser:
    def __init__(self):

        conf = SparkConf().setAppName('Analysiser').set("spark.sql.crossJoin.enabled", True)
        self.sc = SparkContext(conf=conf)
        self.sqlctx = SQLContext(self.sc)

        self.pdf = pd.read_excel('data_o.xlsx', sheetname=0, header=0,  parse_cols=[9, 10, 23, 32, 45, 60])

        schema = StructType([
            StructField('TI',StringType(),True),
            StructField('SO', StringType(), True),
            StructField('C1', StringType(), True),
            StructField('TC', StringType(), True),
            StructField('PY', StringType(), True),
            StructField('UT', StringType(), True)

        ])

        df = self.sqlctx.createDataFrame(self.pdf,schema)

        def m_clean(x):
            try:
                py = int(x['PY'])
                tc = int(x['TC'])
                authors = x['C1']

                if py>=2006 and py<=2016 and authors != '':

                    first_author = authors[1:].split(']')[0].split('; ')[0]

                    return [(x['TI'],x['SO'],x['C1'],first_author,x['TC'],int(x['PY']),x['UT']),]
                else:
                    return []
            except Exception as e:
                return []

        schema2 = StructType([
            StructField('TI', StringType(), True),
            StructField('SO', StringType(), True),
            StructField('C1', StringType(), True),
            StructField('first_author', StringType(), True),
            StructField('TC', StringType(), True),
            StructField('PY', IntegerType(), True),
            StructField('UT', StringType(), True)

        ])
        self.df = self.sqlctx.createDataFrame(df.rdd.flatMap(m_clean),schema2)


        #self.df.show()


    # def parse(self):
    #     .wb = load_workbook('data_min.xlsx')
    #     sheet = wb.get_sheet_by_name('all')
    #     new_wb = openpyxl.Workbook()
    #     new_sheet = new_wb.create_sheet('simple')
    #     new_sheet.append(['TI', 'SO', 'C1', 'TC', 'PY', 'UT'])
    #
    #
    #     for row in list(sheet.rows)[2:100]:
    #         r = [c.value for c in row]
    #         r_min = [r[9],r[10],r[23],r[32],r[45],r[60]]
    #         print(r_min)
    #         new_sheet.append(r_min)
    #     new_wb.save('export.xlsx')

    def parse2(self):
        # a Spark DataFrame has no ExcelWriter attribute; convert to pandas and export instead
        self.df.toPandas().to_excel('output.xls')





    def func1(self):
        df = self.df.toPandas()
        #print(df.head())
        plt.figure(figsize=(9, 6))
        plt.scatter(df['PY'], df['TC'], s=25, alpha=0.4, marker='o')
        # T: the color of the scatter points
        # s: the size of the scatter points
        # alpha: the degree of transparency
        plt.show()


    def func2(self):
        df = self.df
        first_author_df = df.select('first_author','PY').groupBy('first_author').max('PY').withColumnRenamed('max(PY)','maxPY')

        self.sqlctx.registerDataFrameAsTable(df.drop('first_author'),'df')
        self.sqlctx.registerDataFrameAsTable(first_author_df,'fa')

        sql = "select first_author,TC from (fa outer join df on C1 like CONCAT('%',first_author,'%'))"

        join = self.sqlctx.sql(sql)
        join_rdd = join.rdd.map(lambda x:(x['first_author'],x['TC'])).reduceByKey(lambda x,y:x+'-'+y)

        # for r in join_rdd.collect():
        #     print(r)

        def m_h(x):
            flag = False
            h = 0
            cts = [int(x) for x in x[1].split('-')]
            cts.sort(reverse=True)
            for i in range(1, len(list(cts))+1):
                if i >= cts[i-1]:
                    flag = True
                    h = i # TODO or cts[i-1]
                    break

            if flag:
                return [(x[0],h),]
            else:
                return []

        author_h_rdd = join_rdd.flatMap(m_h)
        author_h_df = self.sqlctx.createDataFrame(author_h_rdd,['first_author','h'])
        final_df = author_h_df.join(first_author_df,'first_author','left_outer').select('h','maxPY')
        pdf = final_df.toPandas()

        plt.figure(figsize=(9, 6))
        plt.scatter(pdf['maxPY'], pdf['h'], s=25, alpha=0.4, marker='o')
        # T: the color of the scatter points
        # s: the size of the scatter points
        # alpha: the degree of transparency
        plt.show()
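
The TODO above leaves the break condition in m_h ambiguous. For comparison, a standalone h-index helper (a sketch, not part of the original class): h is the largest i such that the i-th highest citation count is at least i.

def h_index(citations):
    # h-index: largest i such that the i-th highest citation count is >= i
    cts = sorted(citations, reverse=True)
    h = 0
    for i, c in enumerate(cts, start=1):
        if c >= i:
            h = i
        else:
            break
    return h

# example: h_index([5, 4, 2, 1]) == 2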
Example #14
## DataFrame Operate
sales_rdc.filter(col('dc_id') == '772').show()  # filter by a Column
sales_rdc.filter((col('dc_id') == '772') & (
    col('item_first_cate_cd') == '1620')).show()  # filter by some Columns
sales_rdc.filter((col('dc_id') == '772')
                 & (col('item_first_cate_cd') == '1620')
                 & (col('total_sales') != 0)).show()

data = [[2, 3, 4], [1, 2, 3], [7, 6, 5]]
data_df = spark.createDataFrame(data,
                                list('abc'))  # create a DF, with columns name
data_df2 = spark.createDataFrame(data)  # create a DF

data = [[2, 3, 4], [1, 2, 3], [7, 6, 5]]

sqlContext.registerDataFrameAsTable(data_df2,
                                    "test_table")  # register a Tmp Table
test_data = spark.sql('select * from test_table')
# sqlContext.dropTempTable("test_table")

sqlContext.udf.register("stringLengthInt", lambda x: len(str(x)),
                        IntegerType())  # register a Function for SQL
sqlContext.registerFunction("stringLengthInt", lambda x: len(str(x)),
                            IntegerType())
sqlContext.sql("SELECT stringLengthInt('test') as len").show()
sqlContext.sql("SELECT stringLengthInt(a) as len from test_table ").show()

df_as1 = data_df.alias("df_as1")  # alias
df_as2 = data_df.alias("df_as2")
joined_df = df_as1.join(df_as2,
                        col("df_as1.a") == col("df_as2.a"), 'inner')  # 保留了全部列名
joined_df.select("df_as1.a", "df_as2.a", "df_as2.b", "df_as2.c").show()
Example #15
#Defining the schema
ischema = StructType([
    StructField('station', StringType(), False),
    StructField('date', StringType(), False),
    StructField('observation', StringType(), False),
    StructField('value', IntegerType(), False),
    StructField('useless', StringType(), False),
    StructField('quality_flag', StringType(), False)
])

#Reading the csv file
df = sqlContext.read.format('com.databricks.spark.csv').load(
    inputs, schema=ischema).cache()

#Registering table all_weather_data
sqlContext.registerDataFrameAsTable(df, "all_weather_data")

#Filtering TMIN and TMAX values and renaming the columns as min_temp and max_temp
min_temp = sqlContext.sql("""
            SELECT date, station, value as min_temp
            FROM all_weather_data
            WHERE observation="TMIN" AND quality_flag=""
            """)
sqlContext.registerDataFrameAsTable(min_temp, "min_temp")

max_temp = sqlContext.sql("""
            SELECT date, station, value as max_temp
            FROM all_weather_data
            WHERE observation="TMAX" AND quality_flag=""
            """)
sqlContext.registerDataFrameAsTable(max_temp, "max_temp")
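
The excerpt stops after registering the two tables. One possible next step (an assumption, not shown in the original) is joining them on station and date, for example to compute the daily temperature range:

temp_range = sqlContext.sql("""
            SELECT mn.date, mn.station, mx.max_temp - mn.min_temp AS temp_range
            FROM min_temp mn
            JOIN max_temp mx ON mn.date = mx.date AND mn.station = mx.station
            """)
sqlContext.registerDataFrameAsTable(temp_range, "temp_range")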
Example #16
    conf.set("spark.driver.maxResultSize", "10g")

    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)

    # path to hillary/enron avro
    enr = sqlContext.read.format(
        "com.databricks.spark.avro").load(
            "s3n://datasets-396316040607/enron_data/*.avro").repartition(16)
    hil = sqlContext.read.format(
        "com.databricks.spark.avro").load(
            "s3n://datasets-396316040607/hillary/*.avro").repartition(16)

    # register tables
    sqlContext.registerDataFrameAsTable(hil, "hillary")
    sqlContext.registerDataFrameAsTable(enr, "enron")

    # register udf
    sqlContext.registerFunction(
        "getCos", lambda x, y: get_cosine(text_to_vector(x), text_to_vector(y))
    )

    # do the cosine similarity on the text, get the top 1000 matches
    out = sqlContext.sql("SELECT h.author h_auth, e.author e_auth, "
                         "e.contents e_mail, h.contents h_mail, "
                         "getCos(e.contents, h.contents) as cos_sim "
                         "from hillary as h join enron as e order by cos_sim "
                         "desc limit 1000")

    # write back out to s3
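
    # The write step is cut off in this excerpt; a plausible completion (output
    # location and Avro format are assumptions, not taken from the original):
    out.write.format("com.databricks.spark.avro").save(
        "s3n://output-bucket/hillary_enron_cos_sim")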