Example #1
def main():
    """
    :return: Place and magnitude, where magnitude is greater than 1.0.
    """
    start = time.time()

    data = os.path.join(root, path)
    df = sqlContext.read.json(data)
    df.createOrReplaceTempView('earthquakes')
    earthquakes_df = sqlContext.sql("SELECT properties.mag, properties.place "
                                    "FROM earthquakes "
                                    "WHERE properties.mag > 1.0")
    earthquakes_df.show()

    end = time.time()
    print('Time spent', end - start, 'seconds')
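
# For context, a minimal sketch of the globals the snippet above assumes; the
# dataset root and file name below are hypothetical:
import os
import time

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("earthquakes").getOrCreate()
sqlContext = spark  # a SparkSession exposes the same read/sql interface used above
root = "/data"                   # hypothetical dataset root
path = "earthquakes.geojson"     # hypothetical file name

if __name__ == '__main__':
    main()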
Example #2
def results(query_def):
    """Run a Spark SQL query against the tweet dataset and return the result as an HTML table."""
    # Read the JSON dataset and register it as a temporary view.
    data_frame = sqlContext.read.json("dataset/tweetsdata_v1.json")
    data_frame.createOrReplaceTempView("tweetDatatable")

    query = sqlContext.sql(query_def)
    # Collect the query result to the driver as a pandas DataFrame.
    data_frame = query.toPandas()

    # Earlier variants, left disabled: drop NA rows, emit a random test frame,
    # add a CSS class, or style the table with pandas Styler and save it to a file.
    # return data_frame.dropna().to_html()
    # data_frame = pd.DataFrame(np.random.randn(20, 5))
    # return data_frame.to_html(classes='styles')
    # myhtml = data_frame.style.set_properties(
    #     **{'font-size': '11pt', 'font-family': 'Calibri',
    #        'border-collapse': 'collapse', 'border': '1px solid black'}).render()
    # with open('myhtml.html', 'w') as f:
    #     f.write(myhtml)

    return data_frame.to_html()
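
# A minimal usage sketch; the columns in the query are hypothetical, while the
# view name matches the one registered inside results():
html_table = results("SELECT text, lang FROM tweetDatatable LIMIT 10")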
Example #3
def sql(sql):
    """Run a Spark SQL query and show up to 20 rows without truncating column values."""
    sqlContext.sql(sql).show(n=20, truncate=False)
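
# A minimal usage sketch; the view name is hypothetical and assumes a DataFrame
# was registered with createOrReplaceTempView beforehand:
sql("SELECT COUNT(*) AS row_count FROM my_view")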
Example #4
def saveCsv(name, sql):
    """Write the query result as a single headerless CSV part file under `name`."""
    sqlContext.sql(sql).repartition(1).write.format("csv").option(
        "header", "false").save(name)
Example #5
def saveResults(name, sql):
    """Write the query result as a single Parquet output under `name`, replacing any previous run."""
    sqlContext.sql(sql).repartition(1).write.mode('overwrite').parquet(name)
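
# A minimal usage sketch; the output directory and view name are hypothetical.
# mode('overwrite') replaces any previous Parquet output at the same path:
saveResults("output/counts_parquet", "SELECT key, COUNT(*) AS cnt FROM my_view GROUP BY key")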
Example #6
FROM (
    SELECT url, filter(subresources, s -> s.integrity IS NOT NULL) AS subresources 
    FROM cc 
    WHERE size(filter(subresources, s -> s.integrity IS NOT NULL)) > 0
) LATERAL VIEW explode(subresources) T AS sri
GROUP BY host, target
ORDER BY sri DESC
""")

# ---------------------------
# 07: What is the number of elements per target protocol?

select = sqlContext.sql("""
SELECT 
    url as host,
    sri.target as target
FROM cc LATERAL VIEW explode(subresources) T AS sri
WHERE sri.integrity IS NOT NULL
""")


from operator import add
from urllib.parse import urljoin, urlparse


def parse(r):
    # Key each record by (host scheme, target scheme, local vs. remote) for counting.
    h = urlparse(r.host)
    t = urlparse(urljoin(r.host, r.target))
    return ((h.scheme, t.scheme, 'l' if h.netloc == t.netloc else 'r'), 1)


select.rdd.map(parse).reduceByKey(add).toDF().repartition(1).write.mode(
    'overwrite').parquet("07_elements_per_protocol")
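
# A minimal sketch for inspecting the output written above; the path matches the
# parquet write in this example:
sqlContext.read.parquet("07_elements_per_protocol").show(truncate=False)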

# ---------------------------
Example #7
def sql(sql):
    """Run a Spark SQL query and return the result as a pandas DataFrame."""
    return sqlContext.sql(sql).toPandas()
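
# A minimal usage sketch; the view name is hypothetical. toPandas() collects the
# full result to the driver, so keep the query output small:
pandas_df = sql("SELECT lang, COUNT(*) AS cnt FROM my_view GROUP BY lang")
print(pandas_df.head())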
Example #8
                             subjects={"math": 80, "english": 56},
                             enrolled=datetime(2014, 8, 1, 14, 1, 5)),
                         Row(id=2,
                             name="George",
                             active=False,
                             clubs=['chess', 'soccer'],
                             subjects={"math": 60, "english": 96},
                             enrolled=datetime(2015, 3, 21, 8, 2, 5)),
                         ])

record_df = record.toDF()
record_df.show()

record_df.createOrReplaceTempView("records")

all_records_df = sqlContext.sql('SELECT * FROM records')
all_records_df.show()

sqlContext.sql('SELECT id, clubs[1], subjects["english"] FROM records').show()

sqlContext.sql('SELECT id, NOT active from records').show()

sqlContext.sql('SELECT * FROM records where active').show()

sqlContext.sql('SELECT * FROM records where subjects["english"] > 90').show()

record_df.createGlobalTempView("global_records")

sqlContext.sql('SELECT * FROM global_temp.global_records').show()
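# Global temporary views live in the global_temp database and stay visible to
# other sessions of the same application; a minimal sketch, assuming a
# SparkSession named spark is available alongside sqlContext:
spark.newSession().sql('SELECT * FROM global_temp.global_records').show()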

Example #9

def nafill(line):
    # Prefer the inbound number; fall back to the outbound number when it is missing.
    number = None
    if line:
        number = line.number_in if line.number_in else line.number_out
    return (number, line.coefficiant_of_variance_in,
            line.coefficiant_of_variance_out, line.call_count_in,
            line.call_count_out, line.call_count_competitor_out)


if __name__ == '__main__':
    sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")
    df = sqlContext.sql("SELECT * FROM cdrdb.pre_rec_cdr_pqt_vw")
    hotline = sqlContext.read.text('/data/resources/numlist.txt')
    global hotline_list
    hotline_list = hotline.rdd.map(lambda x: x.value).collect()
    df1 = df.select('year', 'month').groupBy('year', 'month').count()
    partitionList = df1.select('year', 'month').collect()
    temp = True
    for i, x in enumerate(partitionList):
        print(x)
        if (x.year and x.month):
            temp_df = sqlContext.sql(
                "SELECT * FROM cdrdb.pre_rec_cdr_pqt_vw WHERE year='{}' AND month='{}'"
                .format(str(x.year), str(x.month)))
            df2 = sqlContext.createDataFrame(temp_df.rdd.map(parse), [
                'number', 'number2', 'type', 'date', 'week', 'callduration',
                'iscompethot'
Example #10
from pyspark.shell import sqlContext
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("name_app")\
    .config("spark.mongodb.input.uri","mongodb://192.168.1.96:27017/mydatabase.data")\
    .config("spark.mongodb.output.uri","mongodb://192.168.1.96:27017/mydatabase.data").getOrCreate()
data = spark.createDataFrame([("Bilbo Baggins", 50), ("Gandalf", 1000),
                              ("Thorin", 195), ("Balin", 178), ("Kili", 77),
                              ("Dwalin", 169), ("Oin", 167), ("Gloin", 158),
                              ("Fili", 82), ("Bombur", None)], ["name", "age"])
#data.write.format("com.mongodb.spark.sql.DefaultSource").option("database","mydatabase").option("collection","data").save()
data.show()
data.createOrReplaceTempView("mytable")
res = sqlContext.sql("select * from mytable limit 5")
res.show()
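
# A minimal read-back sketch, assuming the MongoDB Spark connector that supplies
# com.mongodb.spark.sql.DefaultSource is on the classpath; the database and
# collection mirror the commented-out write above:
read_back = spark.read.format("com.mongodb.spark.sql.DefaultSource")\
    .option("database", "mydatabase").option("collection", "data").load()
read_back.show()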