def SPARKreadFile(sc):
    # Read a FASTQ file (4 lines per record: ID, sequence, separator, quality)
    # and turn it into a Spark DataFrame with one row per read.
    basedir = os.getcwd()
    filename = os.path.join(basedir, 'FAK53004_ae6e213fd4e39f25ca87bf1c770b24c891782abc_0.fastq')
    sc.setLogLevel("WARN")
    # file = open(SparkFiles.get(filename))
    file = sc.textFile(filename)
    lines = file.collect()  # same as file.take(file.count()), without a second pass to count
    Sequence = namedtuple('SEQUENCE', ['NUMBER', 'ID', 'SEQ', 'OP', 'QUAL'])
    DFs = []
    dict_ID = []
    dict_SEQ = []
    dict_OP = []
    dict_QUAL = []
    counter = 0
    for i, v in enumerate(lines):
        if i % 4 == 0:
            dict_ID.append(v)
        if i % 4 == 1:
            dict_SEQ.append(v)
        if i % 4 == 2:
            dict_OP.append(v)
        if i % 4 == 3:
            dict_QUAL.append(v)
            # a complete 4-line record is now available
            df = Sequence(NUMBER=counter, ID=dict_ID[counter], SEQ=dict_SEQ[counter],
                          OP=dict_OP[counter], QUAL=dict_QUAL[counter])
            DFs.append(df)
            counter += 1
    rdd = sc.parallelize(DFs)
    seqDF = rdd.map(lambda x: Row(NUMBER=x[0], ID=x[1], SEQ=x[2], OP=x[3], QUAL=x[4]))
    schemaSeqDF = sqlContext.createDataFrame(seqDF)
    # file.close()
    return schemaSeqDF
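# --- Hypothetical driver-side setup for SPARKreadFile above (a sketch, not part of
# --- the original source): the function relies on module-level `os`, `namedtuple`,
# --- `Row` and a global `sqlContext`, roughly as follows.
import os
from collections import namedtuple

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext(appName="fastq-to-dataframe")
sqlContext = SQLContext(sc)

seq_df = SPARKreadFile(sc)   # expects the FASTQ file in the current working directory
seq_df.printSchema()
seq_df.show(5, truncate=False)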
def HLalignment(a, alignments, tab, Aligner, sc):
    # Align every read against the minimap2/mappy-style index `a`, build SAM-like
    # records (flag, CIGAR with soft clips, MD/cs tags) and return them as a DataFrame.
    reads = ReadFile.readFile3()  # values are (name, seq, qual) tuples
    for name, seq, qual in reads.values():
        try:
            hit = next(a.map(seq, MD=True, cs=True))  # best hit only
            flag = 0 if hit.strand == 1 else 16
            # reverse-complement the read when it maps to the minus strand
            seq = seq if hit.strand == 1 else seq.translate(tab)[::-1]
            # soft-clip the unaligned prefix/suffix of the query
            clip = ['' if x == 0 else '{}S'.format(x)
                    for x in (hit.q_st, len(seq) - hit.q_en)]
            if hit.strand == -1:
                clip = clip[::-1]
            cigar = "".join((clip[0], hit.cigar_str, clip[1]))
            alignment = Aligner(contig=hit.ctg, Rname=name, flag=flag, pos=hit.r_st,
                                mapq=hit.mapq, cigar=cigar, seq=seq,
                                is_primary=hit.is_primary, MDtag=hit.MD,
                                cstag=hit.cs, basequal=qual)
            if hit.mapq >= 10:  # keep only confidently mapped reads
                alignments.append(alignment)
        except StopIteration:
            # no hit: record the read as unmapped (flag 4)
            alignment = Aligner(contig='chr0', Rname=name, flag=4, pos=None, mapq=None,
                                cigar=None, seq=seq, is_primary=False, MDtag=None,
                                cstag=None, basequal=qual)
            alignments.append(alignment)
    rdd = sc.parallelize(alignments)
    seqDF = rdd.map(lambda x: Row(contig=x[0], Rname=x[1], flag=x[2], pos=x[3],
                                  mapq=x[4], cigar=x[5], seq=x[6], is_primary=x[7],
                                  MDtag=x[8], cstag=x[9], basequal=x[10]))
    DF = sqlContext.createDataFrame(seqDF)
    return DF
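# --- A minimal sketch (an assumption, not from the original source) of the objects
# --- HLalignment expects: `a` is a mappy (minimap2) index, `tab` a base-complement
# --- table for reverse-complementing, and `Aligner` a plain record type whose field
# --- order matches the positional indexing used above.
from collections import namedtuple

import mappy as mp

a = mp.Aligner("reference.fa", preset="map-ont")   # hypothetical reference path
tab = str.maketrans("ACGTacgt", "TGCAtgca")        # complement translation table
Aligner = namedtuple("Aligner", ["contig", "Rname", "flag", "pos", "mapq", "cigar",
                                 "seq", "is_primary", "MDtag", "cstag", "basequal"])

alignments = []
# df = HLalignment(a, alignments, tab, Aligner, sc)  # `sc` / `sqlContext` as above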
def Sparkseeds(dict, i, k, hashDF, sc):
    word = [(i, HashTable.hash_djb2(dict[i][j:j + k]), j)
            for j in range(0, len(dict[i]) - k)]
    rddW = sc.parallelize(word)
    schemaWordDF = rddW.map(lambda x: Row(NUM_SEQ=x[0], ID_SEQ=x[1], POS_SEQ=x[2]))
    df = sqlContext.createDataFrame(schemaWordDF)
    reDF = df.join(hashDF, df.ID_SEQ == hashDF.ID_GEN, how='inner')
    reDF = reDF.orderBy(reDF.POS_SEQ).select(reDF.NUM_SEQ, reDF.ID_SEQ,
                                             reDF.POS_SEQ, reDF.POS_GEN)
    my_window = Window.partitionBy(reDF.NUM_SEQ).orderBy(reDF.POS_SEQ)
    reDF = reDF.withColumn("prev_value", F.lag(reDF.POS_SEQ).over(my_window))
    reDF = reDF.withColumn(
        "dist",
        F.when(F.isnull(reDF.POS_SEQ - reDF.prev_value), 0)
         .otherwise(reDF.POS_SEQ - reDF.prev_value))
    reDF = reDF.select(reDF.NUM_SEQ, reDF.ID_SEQ, reDF.POS_SEQ, reDF.dist, reDF.POS_GEN)
    reDF = reDF.withColumn("dist0", F.lead(reDF.dist).over(my_window))
    elDF = reDF.filter(((reDF.dist == 0) | (reDF.dist >= 50)) &
                       ((reDF.dist0.isNull()) | (reDF.dist0 >= 50)))
    reDF = reDF.subtract(elDF)
    reDF = reDF.orderBy(reDF.POS_SEQ).select(reDF.NUM_SEQ, reDF.ID_SEQ,
                                             reDF.POS_SEQ, reDF.POS_GEN)
    # pos = function(reDF)
    return reDF
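# --- Sketch of the seed-hashing helper referenced above (an assumption: the original
# --- HashTable.hash_djb2 is not shown in these snippets). djb2 is Bernstein's classic
# --- string hash: h = h * 33 + byte, starting from 5381.
class HashTable:
    @staticmethod
    def hash_djb2(s):
        h = 5381
        for c in s:
            h = ((h << 5) + h) + ord(c)   # h * 33 + ord(c)
        return h & 0xFFFFFFFF             # keep the key in 32 bits

# HashTable.hash_djb2("ACGTACGT") -> an integer key for the k-mer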
def test_read_dataframe_with_path(self):
    paths = ["/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-06-30",
             "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-07-31"]
    dataframe = self.dataFrame.read_dataframe(paths=paths)
    empty_dataframe = sqlContext.createDataFrame([], StructType([]))
    self.assertNotEqual(dataframe, empty_dataframe)
def pandas_to_spark(pandas_df):
    columns = list(pandas_df.columns)
    types = list(pandas_df.dtypes)
    struct_list = []
    for column, typo in zip(columns, types):
        struct_list.append(define_structure(column, typo))
    p_schema = StructType(struct_list)
    return sqlContext.createDataFrame(pandas_df, p_schema)
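# --- define_structure is referenced here (and again further down) but not defined in
# --- these snippets; the following is one plausible minimal implementation, an
# --- assumption rather than the original code: map a pandas dtype to a Spark type.
from pyspark.sql.types import (StructField, StringType, IntegerType, LongType,
                               DoubleType, FloatType, TimestampType, BooleanType)

def define_structure(column, pandas_dtype):
    dtype = str(pandas_dtype)
    if dtype == 'int64':
        spark_type = LongType()
    elif dtype == 'int32':
        spark_type = IntegerType()
    elif dtype == 'float64':
        spark_type = DoubleType()
    elif dtype == 'float32':
        spark_type = FloatType()
    elif dtype.startswith('datetime64'):
        spark_type = TimestampType()
    elif dtype == 'bool':
        spark_type = BooleanType()
    else:
        spark_type = StringType()   # object dtype and anything unrecognised
    return StructField(column, spark_type, nullable=True)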
def test_read_dataframe_with_path_retrieving_partition_name(self):
    paths = ["/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-06-30",
             "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-07-31"]
    dataframe = self.dataFrame.read_dataframe(paths=paths, options={'basePath': self.path})
    empty_dataframe = sqlContext.createDataFrame([], StructType([]))
    self.assertNotEqual(dataframe, empty_dataframe)
    self.assertTrue("cutoff_date" in dataframe.schema.names)
def test_read_dataframes_with_date_range(self):
    dataframe = self.dataFrame.read_dataframes(self.path,
                                               process_date=["2020-05-31", "2020-07-31"],
                                               options={"basePath": self.path})
    empty_dataframe = sqlContext.createDataFrame([], StructType([]))
    dates = dataframe.select("cutoff_date").dropDuplicates().collect()
    expected_dates = [Row(cutoff_date=datetime.date(2020, 7, 31)),
                      Row(cutoff_date=datetime.date(2020, 5, 31)),
                      Row(cutoff_date=datetime.date(2020, 6, 30))]
    self.assertNotEqual(dataframe, empty_dataframe)
    self.assertEqual(dates, expected_dates)
def __init__(self, scoreAndLabels, sc):
    df = sqlContext.createDataFrame(scoreAndLabels, schema=StructType([
        StructField("score", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)
    ]))
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
    java_model = java_class(df._jdf)
    super(BinaryClassificationMetrics, self).__init__(java_model)
def get_spark_df(self, df):
    self.df = df
    meta = self.get_pdf_column_meta(self.df.columns)
    struct_list = []
    for x in meta:
        # tpe = col_attr(meta, str(x))
        tpe = [
            str(meta.get(x).get(self.dtypeHeader)),
            str(meta.get(x).get(self.actualHeader))
        ]
        struct_list.append(self.define_structure(x, tpe[0], tpe[1]))
    p_schema = StructType(struct_list)
    return sqlContext.createDataFrame(self.df, p_schema)
def save_data(rdd):
    """
    Parsing JSON value in each RDD
    Creating Spark SQL DataFrame from RDD
    Writing DataFrame to HDFS and Oracle DB
    """
    global flag
    flag = False
    if not rdd.isEmpty():
        rdd = rdd.map(lambda m: parse(m[1]))
        df = sqlContext.createDataFrame(rdd)
        df.createOrReplaceTempView("t")
        # keep only the first record per event_id (duplicate handling)
        result = spark.sql(
            '''select event_id, event_type
               from (select row_number() over (partition by _1 order by _2) as RN,
                            _1 as event_id, _2 as event_type
                     from t)
               where RN = 1''')
        count = result.count()
        try:
            # Writing to HDFS
            result.write \
                .format("csv") \
                .mode("append") \
                .option("header", "true") \
                .save(HDFS_OUTPUT_PATH)
            # Writing to Oracle DB
            result.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", TARGET_DB_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()
            write_log('INFO', 'Consumer_dim_event_type.py', 'main',
                      '{} rows inserted successfully'.format(count))
        except Exception as e:
            print('--> It seems an Error occurred: {}'.format(e))
            write_log('ERROR', 'Consumer_dim_event_type.py', 'main', str(e)[:1000])
            flag = True
    else:
        ssc.stop()
    return rdd
def best_choice(dict, i, PG, seedArray, genome, sc):
    SC = []
    for z in range(len(PG)):
        for pos_gen in PG[z]:
            seq = (dict[i],
                   genome[pos_gen - seedArray[z]: pos_gen - seedArray[z] + len(dict[i])],
                   seedArray[z],
                   pos_gen)
            SC.append(seq)
    rddSeq = sc.parallelize(SC)
    schemaSeqDF = rddSeq.map(lambda x: Row(SEQ=x[0], GEN=x[1], POS_SEQ=x[2], POS_GEN=x[3]))
    df = sqlContext.createDataFrame(schemaSeqDF)
    # edit distance between the read and each candidate genome window,
    # expressed as a percentage of the read length
    df = df.withColumn("dist", F.levenshtein(F.col("SEQ"), F.col("GEN")))
    val = (1 / float(len(dict[i]))) * 100
    df = df.withColumn("percentage", val * F.col("dist")).drop("dist")
    minDF = df.agg(F.min(F.col("percentage")).alias("percentage"))
    min_percentage = [x["percentage"] for x in minDF.rdd.collect()]
    df = df.filter(df.percentage == min_percentage[0])
    return df, min_percentage
def write_offset_ranges(rdd):
    """
    Write the untilOffset (exclusive ending offset) of each consumed
    offset range to the offsets table in the DB.
    """
    if not flag:
        for o in offsetRanges:
            currentOffset = int(o.untilOffset)
            df_write_offsets = sqlContext.createDataFrame([{"OFFSET": currentOffset}])
            df_write_offsets.write \
                .format("jdbc") \
                .mode("overwrite") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", OFFSET_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()
def convert_list_to_df(spark: SparkSession, table: set, schema: StructType,
                       table_name: str) -> (DataFrame, str):
    """Convert table data (set/list), its schema and its name into a (DataFrame, name) tuple.

    :param spark: SparkSession
    :param table: set - Data from the table
    :param schema: StructType - Schema of the table
    :param table_name: str - Table name
    :return: Table as a DataFrame together with the table name (str)
    """
    table: list = convert_set_to_list(table)
    print("Converting content of the \"{}\" table from List to DataFrame format..."
          .format(table_name))
    sc: SparkContext = spark.sparkContext
    rdd = sc.parallelize(table)
    df: DataFrame = sqlContext.createDataFrame(rdd, schema)
    return df, table_name
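# --- Hypothetical usage of convert_list_to_df (a sketch; convert_set_to_list and the
# --- sample data are assumptions, not part of the original source).
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

def convert_set_to_list(table):
    # the original helper is not shown; sorting gives a deterministic row order
    return sorted(table)

spark = SparkSession.builder.appName("list-to-df").getOrCreate()
sqlContext = SQLContext(spark.sparkContext)   # createDataFrame above goes through sqlContext

sample_schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), True),
])
df, name = convert_list_to_df(spark, {(1, "a"), (2, "b")}, sample_schema, "sample_table")
df.show()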
def save_data(rdd): """ Function for saving data in window """ global NAMES if not rdd.isEmpty(): # parsing data in RDD rdd = rdd \ .map(lambda x: parser.parse(x[1])) \ .map(lambda data: collect(data)) \ .reduceByKey(lambda rec1, rec2: max(rec1, rec2, key=last_record)) NAMES = dict(rdd.collect()) print( "************************************> NAMES <************************************" ) print(NAMES) print( "************************************> NAMES <************************************" ) rdd = rdd \ .map(lambda rec: (rec[0], rec[1][0], rec[1][1], rec[1][2])) # create DataFrame and View df = sqlContext.createDataFrame(rdd) df.createOrReplaceTempView("t") # query for getting result res = spark.sql( 'select t._1 as NAME, t._2 as COUNT_NAME, t._3 as AVG_TRAFFIC, t._4 as AVG_SUCCESS_SELL from t' ) res.show(40) # res.printSchema() # res = spark.sql('select count(*) KEY, sum(t._2) VALUE from t') res \ .write \ .format("jdbc") \ .mode("overwrite") \ .option("driver", 'oracle.jdbc.OracleDriver') \ .option("url", "jdbc:oracle:thin:@{0}:{1}:orcl".format(IP_DB, PORT_DB)) \ .option("dbtable", "tmp_kafka") \ .option("user", "kozyar") \ .option("password", "usertest") \ .save() # spark.catalog.dropTempView("t") return rdd
sqlContext.setConf("spark.sql.parquet.binaryAsString", "true") df = sqlContext.sql("SELECT * FROM cdrdb.pre_rec_cdr_pqt_vw") hotline = sqlContext.read.text('/data/resources/numlist.txt') global hotline_list hotline_list = hotline.map(lambda x: x.value).collect() df1 = df.select('year', 'month').groupBy('year', 'month').count() partionList = df1.select('year', 'month').collect() temp = True for i, x in enumerate(partionList): print(x) if (x.year and x.month): temp_df = sqlContext.sql( "SELECT * FROM cdrdb.pre_rec_cdr_pqt_vw WHERE year='{}' AND month='{}'" .format(str(x.year), str(x.month))) df2 = sqlContext.createDataFrame(temp_df.map(parse), [ 'number', 'number2', 'type', 'date', 'week', 'callduration', 'iscompethot' ]) if (temp): temp = False print("Overwriting") df2.filter( df2.type != 'invalid').write.mode("overwrite").saveAsTable( "cdr_step0", format="parquet", path="/data/intermediate_data/cdr_step0/") else: print("Appending") df2.filter( df2.type != 'invalid').write.mode("append").saveAsTable( "cdr_step0", format="parquet",
# print("Partitions structure: {}".format(rdd_1.glom().collect()), "\n")
#
# # Creation of an RDD from another RDD through the map() transformation (add 4 to each number)
# rdd_00 = rdd_0.map(lambda x: x + 4)
# print(rdd_00.toDebugString())
# print("Number of partitions: {}".format(rdd_00.getNumPartitions()))
# print("Partitions structure: {}".format(rdd_00.glom().collect()), "\n")

# Creation of an RDD from a pandas DF
# Create a pandas DataFrame from a dictionary
data = {"Name": ['Roshan', 'Hossam', 'Bala', 'Marcel', 'Deepak'],
        "Membership Due": [100, 200, 300, 400, 500]}
pandasDf0 = pd.DataFrame(data)
print(" Here is the pandas DF created from a dictionary \n {}".format(pandasDf0.head()))

# Now convert the pandas DF to a Spark DF with 7 partitions using the sqlContext.createDataFrame(df) method
sparkDf0 = sqlContext.createDataFrame(pandasDf0).repartition(7)
rdd_000 = sparkDf0.rdd.map(list)
print("\n Number of partitions: {}".format(rdd_000.getNumPartitions()))
print("\n Converted RDD rdd_000 (data shown inside partitions): \n {}".format(rdd_000.glom().collect()))
print("\n Converted RDD rdd_000 (data shown as a flat list): \n {}".format(rdd_000.collect()))

# # get the RDD lineage
# print(rdd_1.toDebugString(), "\n")
#
# # add 20 to each number
# rdd_2 = rdd_1.map(lambda x: x + 20)
#
# # RDD object
# print(rdd_2)
#
def save_data(rdd):
    """
    Parsing JSON value in each RDD
    Creating Spark SQL DataFrame from RDD
    Writing DataFrame to HDFS and Oracle DB
    """
    global flag
    flag = False
    if not rdd.isEmpty():
        # Read the current max ID from the target table for duplicate handling
        write_log('INFO', 'Consumer_fct_prod.py', 'main', 'Executing max_id')
        df_max_id = spark.read \
            .format("jdbc") \
            .option("driver", DRIVER) \
            .option("url", URL_TARGET_DB) \
            .option("dbtable", "(SELECT max(ID) ID from " + TARGET_DB_TABLE_NAME + ")") \
            .option("user", TARGET_DB_USER_NAME) \
            .option("password", TARGET_DB_USER_PASSWORD) \
            .load()
        max_id = df_max_id.agg({'ID': 'max'}).collect()[0][0]
        if max_id is None:
            max_id = 0
        write_log('INFO', 'Consumer_fct_prod.py', 'main',
                  'Max id executed successfully max_id = {}'.format(max_id))
        rdd = rdd.map(lambda m: parse(m[1]))
        df_fct_prod = sqlContext.createDataFrame(rdd)
        df_fct_prod.createOrReplaceTempView("t")
        # keep only new rows (id > max_id) and the first record per id
        result = spark.sql(
            '''select id, event_id, event_time, product_id, customer_id
               from (select row_number() over (partition by _1 order by _3) as RN,
                            _1 as id, _2 as event_id, to_timestamp(_3) as event_time,
                            _4 as product_id, _5 as customer_id
                     from t
                     where _1 > ''' + str(max_id) + ''') where RN = 1''')
        count = result.count()
        try:
            write_log('INFO', 'Consumer_fct_prod.py', 'main',
                      'Consumer is inserting {} rows to DB'.format(count))
            # Writing to HDFS
            result.write \
                .format("csv") \
                .mode("append") \
                .option("header", "true") \
                .save(HDFS_OUTPUT_PATH)
            # Writing to Oracle DB
            result.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", TARGET_DB_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()
            write_log('INFO', 'Consumer_fct_prod.py', 'main',
                      '{} rows inserted to DB successfully'.format(count))
        except Exception as e:
            print('--> It seems an Error occurred: {}'.format(e))
            write_log('ERROR', 'Consumer_fct_prod.py', 'main', str(e)[:1000])
            flag = True
    else:
        ssc.stop()
    return rdd
.options(delimiter=',', header=True, inferSchema=False) \ .schema(songs2tracks_schema) \ .load(songs2tracks_file) metadata_df = sqlContext.read.format('com.databricks.spark.csv') \ .options(delimiter=',', header=True, inferSchema=False) \ .schema(metadata_schema) \ .load(metadata_file) # change ids from strings to integers userId_change = plays_df.select('userId').distinct().select( 'userId', F.monotonically_increasing_id().alias('new_userId')) user_als_id_LUT = sqlContext.createDataFrame( userId_change.rdd.map(lambda x: x[0]).zipWithIndex(), StructType([ StructField("userId", StringType(), True), StructField("user_als_id", IntegerType(), True) ])) songId_change = plays_df.select('songId').distinct().select( 'songId', F.monotonically_increasing_id().alias('new_songId')) song_als_id_LUT = sqlContext.createDataFrame( songId_change.rdd.map(lambda x: x[0]).zipWithIndex(), StructType([ StructField("songId", StringType(), True), StructField("song_als_id", IntegerType(), True) ])) # RUN BELOW TWO LINES TO CHECK IF THE NEW USER_ID, SONG_ID GENERATED PROPERLY # user_als_id_LUT.show(5)
from pyspark.shell import sqlContext
from pyspark.sql import *

names = ["Alice", "Bob", "Mike"]
items = ["milk", "bread", "butter", "apples", "oranges"]
# cycle through all three names and all five items so the crosstab covers every pair
# (the original indices i % 1 and i % 3 would only ever pick "Alice" and the first three items)
df = sqlContext.createDataFrame([(names[i % 3], items[i % 5]) for i in range(100)],
                                ["name", "item"])
df.stat.crosstab("name", "item").show()
def test_read_dataframe(self):
    dataframe = self.dataFrame.read_dataframe(self.path)
    empty_dataframe = sqlContext.createDataFrame([], StructType([]))
    self.assertNotEqual(dataframe, empty_dataframe)
# binin.close()
# binout = open('hash1.bin', 'wb')
# data = pickle.dumps(ht1)
# binout.write(data)
# binout.close()

# 300000
binin = open('hash1.bin', 'rb')
ht1 = pickle.load(binin)
binin.close()
# ==================================================================
rdd = sc.parallelize(ht1.items())
schemaHashDF = rdd.map(lambda x: Row(ID_GEN=x[0], POS_GEN=x[1]))
hashDF = sqlContext.createDataFrame(schemaHashDF)
# hashDF.show()

print('\033[1m' + 'ALIGNMENT USING SPARK:' + '\033[0m')
startS = datetime.now()
SparkAligner.alignerSpark(dict, genome, hashDF, sc, dict_map)
endS = datetime.now()
print('\033[1m' + 'TIME WITH SPARK: ' + '\033[0m', endS - startS)
print(
    "======================================================================================================================================================"
)

# print('\033[1m' + 'ALIGNMENT WITHOUT SPARK:' + '\033[0m')
# start = datetime.now()
# Aligner.aligner(dict, genome, ht)
# end = datetime.now()
# query the website and return the html to the variable 'page'
page = requests.get(quote_page).text
soup = BeautifulSoup(page, 'html.parser')
# print(soup.prettify())

# Table for 2019 estimate, 2016 land area (km), 2016 population density extracted (rank ascending)
tab = soup.find('div', {'class': 'mw-parser-output'})
tab = tab.find('table', {'class': 'wikitable sortable'})
tab = tab.find('tbody')
tab = tab.findAll('tr')

city_pop_us_tab = []
ans = []
# build the complete table row by row
for tr in tab:
    city_pop_us_tab = tr.text.strip()
    city_pop_us_tab = city_pop_us_tab.replace(u'\xa0', u' ')
    city_pop_us_tab = city_pop_us_tab.split('\n')
    ans.append(city_pop_us_tab)
    # print(city_pop_us_tab)
    # print('+++++++++++++++++')
ans.remove(ans[0])  # drop the header row

rdd = spark.sparkContext.parallelize(ans)
# print(rdd.collect())
dict_dataframe = sqlContext.createDataFrame(rdd, ['rank', ' ', 'city', '', 'state', '',
                                                  '2019estimate', '', '2010censor', '',
                                                  'change', '', '2016land1', '', '2016land2',
                                                  '', '2016pop1', '', '2016pop2'])
dict_dataframe.show()
def ProcessData(pandaData, pipeline):
    sparkData = sqlContext.createDataFrame(pandaData)
    transformedData = pipeline.fit(sparkData).transform(sparkData)
    return transformedData
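# --- Hypothetical usage of ProcessData (a sketch; the pandas data and the pipeline
# --- stages are assumptions, not the original caller's code).
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

panda_data = pd.DataFrame({"color": ["red", "blue", "red"], "amount": [1.0, 2.5, 3.0]})
pipeline = Pipeline(stages=[
    StringIndexer(inputCol="color", outputCol="color_idx"),
    VectorAssembler(inputCols=["color_idx", "amount"], outputCol="features"),
])
ProcessData(panda_data, pipeline).show()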
reduced = rdd.map(lambda row: ((row[2], row[3], row[6], row[7]), [(row[1], row[0])])) \ .reduceByKey(lambda x, y: x + y) \ .map(lambda row: (row[0], sorted(row[1], key=lambda text: text[0]))).filter(lambda row: len(row[1]) == 2) \ .map(lambda row: (row[1][0][1], row[1][1][1], row[0][0], row[0][1], row[0][2], row[0][3])) schema_red = typ.StructType([ typ.StructField('Start Date', typ.StringType(), False), typ.StructField('End Date', typ.StringType(), False), typ.StructField('Private IP', typ.StringType(), False), typ.StructField('Private Port', typ.StringType(), False), typ.StructField('Destination IP', typ.StringType(), False), typ.StructField('Destination Port', typ.StringType(), False) ]) df_red = sqlContext.createDataFrame(reduced, schema_red) df_red.show() """ Output +-------------------+-------------------+--------------+------------+--------------+----------------+ | Start Date| End Date| Private IP|Private Port|Destination IP|Destination Port| +-------------------+-------------------+--------------+------------+--------------+----------------+ |22-02-2016 12:40:59|22-02-2016 12:42:04|100.68.154.175| 112| 216.58.197.77| 0| |22-02-2016 12:41:07|22-02-2016 12:43:16|100.77.186.232| 38867| 100.1.200.99| 8080| |22-02-2016 12:41:08|22-02-2016 12:43:18|100.68.154.175| 11882| 59.144.144.99| 53| |22-02-2016 12:41:12|22-02-2016 12:43:21|100.77.186.232| 38875| 100.1.200.99| 8080| |22-02-2016 12:41:17|22-02-2016 12:43:26|100.76.103.155| 35050| 59.144.144.99| 53| |22-02-2016 12:41:17|22-02-2016 12:43:26|100.77.186.232| 38880| 100.1.200.99| 8080| |22-02-2016 12:41:17|22-02-2016 12:43:26|100.77.186.232| 38881| 100.1.200.99| 8080| |22-02-2016 12:41:18|22-02-2016 12:43:28|100.77.186.232| 38885| 100.1.200.99| 8080| |22-02-2016 12:41:22|22-02-2016 12:43:31|100.77.186.232| 38889| 100.1.200.99| 8080|
pandas_df = qGET.to_pandas() print(pandas_to_spark(pandas_df)) columns = list(pandas_df.columns) print('get columns', columns) types = list(pandas_df.dtypes) print('get types', types) struct_list = [] for column, typo in zip(columns, types): struct_list.append(define_structure(column, typo)) p_schema = StructType(struct_list) print('get p_schema', p_schema) spark_DF = sqlContext.createDataFrame(pandas_df, p_schema) print('get spark_DF', spark_DF) ''' # Connect to sqlite3 database conn = sqlite3.connect("pythonsqlite.db") cur = conn.cursor() # look dataset in quandl & transfter to pandas df = pandaset().lookpandaset('WIKI/AAPL') # pandas dataFrame store to Sqlite3 df.to_sql("daily_flights", conn, if_exists="replace") pd_daily_flights = pd.read_sql_query("select * from daily_flights limit 10;", conn) print('daily_flights', pd_daily_flights) getdata = quandl.get('FRED/GDP', start_date='2010-01-01', end_date='2014-01-01',
# Separating categorical and numerical columns
Id_col = ['customerID']
target_col = ["Churn"]
cat_cols = pandasData.nunique()[pandasData.nunique() < 6].keys().tolist()
cat_cols = [x for x in cat_cols if x not in target_col]
num_cols = [
    x for x in pandasData.columns
    if x not in cat_cols + target_col + Id_col
]
# labels
lab = pandasData["Churn"].value_counts().keys().tolist()
# values
val = pandasData["Churn"].value_counts().values.tolist()

spark_df = sqlContext.createDataFrame(pandasData)
spark_df.show()


def func(pct, allvals):
    absolute = int(pct / 100. * np.sum(allvals))
    return "{:.1f}%".format(pct, absolute)


def churnPlot():
    fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))
    wedges, texts, autotexts = ax.pie(val,
                                      autopct=lambda pct: func(pct, val),
                                      textprops=dict(color="w"))
    ax.legend(wedges, lab,
              title="Légende",
              loc="center left",
def save_data(rdd):
    """
    Parsing JSON value in each RDD
    Creating Spark SQL DataFrame from RDD
    Writing DataFrame to HDFS and Oracle DB
    """
    global flag
    flag = False
    if not rdd.isEmpty():
        # Read the current max product_id from the target table for duplicate handling
        df_max_id = spark.read \
            .format("jdbc") \
            .option("driver", DRIVER) \
            .option("url", URL_TARGET_DB) \
            .option("dbtable", TARGET_DB_TABLE_NAME) \
            .option("user", TARGET_DB_USER_NAME) \
            .option("password", TARGET_DB_USER_PASSWORD) \
            .load()
        max_id = df_max_id.agg({'product_id': 'max'}).collect()[0][0]
        if max_id is None:
            max_id = 0
        rdd = rdd.map(lambda m: parse(m[1]))
        df = sqlContext.createDataFrame(rdd)
        df.createOrReplaceTempView("t")
        # keep only new rows (product_id > max_id) and the first record per product_id
        result = spark.sql(
            '''select product_id, category_id, brand, description, name, price, last_update_date
               from (select row_number() over (partition by _1 order by _7) as RN,
                            _1 as product_id, _2 as category_id, _3 as brand,
                            _4 as description, _5 as name, _6 as price,
                            to_timestamp(_7) as last_update_date
                     from t
                     where _1 > ''' + str(max_id) + ''') where RN = 1''')
        count = result.count()
        try:
            # Writing to HDFS
            result.write \
                .format("csv") \
                .mode("append") \
                .option("header", "true") \
                .save(HDFS_OUTPUT_PATH)
            # Writing to Oracle DB
            result.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", TARGET_DB_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()
            write_log('INFO', 'Consumer_dim_products.py', 'main',
                      '{} rows inserted successfully'.format(count))
        except Exception as e:
            print('--> It seems an Error occurred: {}'.format(e))
            write_log('ERROR', 'Consumer_dim_products.py', 'main', str(e)[:1000])
            flag = True
    else:
        ssc.stop()
    return rdd
'CITY': 'Amsterdam' }, index=[0]) #pd_person = pd.DataFrame({'ADDRESS':'Museumplein','CITY':'Amsterdam','FIRSTNAME':'John','LASTNAME':'Doe','PERSONID':'0'}, index=[0]) #Create PySpark DataFrame Schema p_schema = StructType([ StructField('ADDRESS', StringType(), True), StructField('CITY', StringType(), True), StructField('FIRSTNAME', StringType(), True), StructField('LASTNAME', StringType(), True), StructField('PERSONID', StringType(), True) ]) #Create Spark DataFrame from Pandas df_person = sqlContext.createDataFrame(pd_person, p_schema) #Important to order columns in the same order as the target database df_persons = df_person.select("PERSONID", "LASTNAME", "FIRSTNAME", "CITY", "ADDRESS") spark = SparkSession.builder.appName('pandasToSparkDF').getOrCreate() df_persons.createOrReplaceTempView("DimSalary") spark.sql("select * from DimSalary").show() ''' spark = SparkSession.builder.appName('pandasToSparkDF').getOrCreate() quandl.ApiConfig.api_key = 'P6LZzSkdVN6zTXQDE6Pd' qGET = quandl.Dataset('NSE/OIL').data() df = qGET.to_pandas() df_pd = pd.DataFrame(df)
def pretty_print_pagerank(graphframes, google):
    """ Prints a pretty chart with Google, GraphFrames, and the Deltas """
    # Divide by total to match Google
    print("+-------+---------------------+")
    print("|Google\t|GraphFrames\t|Delta|")
    print("+-------+---------------------+")
    for key in google:
        goog = google[key]
        g_frames = graphframes[key]  # use the argument rather than the global graphframes_pagerank
        print("|{}\t|{:.3f}\t\t|{:.3f}|".format(goog, g_frames, abs(goog - g_frames)))
    print("+-------+---------------------+")


""" ## Create some edges and vertices to match Fig 2.1 in the paper """
vertices = sqlContext.createDataFrame([(1, ), (2, ), (3, ), (4, )], ["id"])
edges = sqlContext.createDataFrame([(1, 2), (1, 3), (1, 4), (2, 3), (2, 4),
                                    (3, 1), (4, 1), (4, 3)], ["src", "dst"])
graph = GraphFrame(vertices, edges)

""" ## Show Vertices """
display_graph(graph.vertices)
doc.show()

""" ## Show Edges """
display_graph(graph.edges)
doc.show()

""" ## Show Degrees (Sum of in and out degrees by node) """
display_graph(graph.degrees)
doc.show()

""" Show all motifs which satisfy a->b->c """
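# --- Hypothetical way the two pagerank dictionaries compared above could be built
# --- (a sketch, not the original code): run GraphFrames' pageRank and collect the
# --- per-vertex scores; `google` would hold externally computed reference values.
results = graph.pageRank(resetProbability=0.15, maxIter=10)
graphframes_pagerank = {row["id"]: row["pagerank"]
                        for row in results.vertices.collect()}
google = {1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0}  # placeholder reference values only

pretty_print_pagerank(graphframes_pagerank, google)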
def test_read_dataframes(self):
    dataframe = self.dataFrame.read_dataframes(self.path, partition_number=1)
    empty_dataframe = sqlContext.createDataFrame([], StructType([]))
    self.assertNotEqual(dataframe, empty_dataframe)