"""SimpleApp1.py""" from pyspark import SparkContext as sc logFile = "/home/tibil/Downloads/spark-2.4.4-bin-hadoop2.7/README.md" # Should be some file on your system sc = sc("local", "ps") logData = sc.textFile(logFile).cache() numAs = logData.filter(lambda s: 'a' in s).count() numBs = logData.filter(lambda s: 'b' in s).count() print("Lines with a: %i, lines with b: %i" % (numAs, numBs)) print("Lines with a: %i, lines with b: %i" % (numAs, numBs))
import cv2
import numpy as np
from pyspark import SparkContext


def assembleImage(images):  # function header assumed; the original snippet only shows this tail
    image = np.concatenate(images, axis=0)
    return image


# The image is partitioned into Blocks * Blocks tiles
Blocks = 8
# Threshold of the edge map
T = 50
# Size of the filter and the number of pixels to extend at each seam
filterSize = 3
numExt = (filterSize - 1) // 2

# Getting an instance of Spark context
sc = SparkContext()

# Obtaining an RDD of image files from HDFS
hdfsDirectory = 'hdfs://localhost:9000/SampleImages/'
rdd = sc.binaryFiles(hdfsDirectory + '*')

# Decoding the images -- file_params (fileName, binary)
rdd = rdd.map(lambda file_params: (
    file_params[0],
    cv2.imdecode(np.asarray(bytearray(file_params[1]), dtype=np.uint8), 1)))

# file_params (fileName, img) -> file_params (i, (fileName, img))
rdd = rdd.flatMap(lambda file_params: extendVertical(file_params))

# file_params (i, (fileName, img)) -> file_params ((i, j), (fileName, img))
rdd = rdd.flatMap(lambda file_params: extendHorizontal(file_params))
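extendVertical and extendHorizontal are not defined in this snippet. A minimal sketch of what they might look like, inferred only from how they are called above: split each image into Blocks strips, each padded by numExt pixels so the filterSize filter has context across seams. The names, key layout, and padding scheme are assumptions, not the original implementation.

def extendVertical(file_params):
    # (fileName, img) -> [(i, (fileName, strip))]  -- assumed behavior
    fileName, img = file_params
    h = img.shape[0]
    step = h // Blocks
    return [(i, (fileName, img[max(i * step - numExt, 0):min((i + 1) * step + numExt, h)]))
            for i in range(Blocks)]


def extendHorizontal(file_params):
    # (i, (fileName, img)) -> [((i, j), (fileName, block))]  -- assumed behavior
    i, (fileName, img) = file_params
    w = img.shape[1]
    step = w // Blocks
    return [((i, j), (fileName, img[:, max(j * step - numExt, 0):min((j + 1) * step + numExt, w)]))
            for j in range(Blocks)]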
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# getMovieData, movie_wordcloud, getStory, top_25, build_chart, als_model,
# sentiment_model and predict_sentiment_df are defined elsewhere in this project.

if __name__ == "__main__":
    os.chdir("D:\\trentsemester2\\bigData\\the-movies-dataset")
    sparkconf = SparkConf().setAppName("movie").setMaster("local[*]")
    sparkcont = SparkContext(conf=sparkconf)
    sparkcont.setLogLevel("ERROR")
    sqlContext = SQLContext(sparkcont)

    pandas_df = getMovieData()
    df = sqlContext.createDataFrame(pandas_df)

    movie_wordcloud(df)
    getStory(pandas_df)
    top_25(df)

    print("Top 15 romance movies")
    build_chart(df=df, genre='Romance').limit(15).show()

    top10 = als_model(8, df)
    ls_sentiment_model = sentiment_model()
    predict_sentiment_df(top10, ls_sentiment_model)
    print("End of Project")
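build_chart is not shown here. A plausible sketch, assuming it ranks a genre's movies by the IMDB-style weighted rating WR = v/(v+m)*R + m/(v+m)*C; the column names (genres, vote_count, vote_average), the quantile cutoff, and the helper body are all assumptions about the hidden implementation:

from pyspark.sql import functions as F

def build_chart(df, genre, percentile=0.85):
    # Hypothetical sketch: columns follow the movies_metadata.csv schema.
    genre_df = df.filter(F.col('genres').contains(genre))
    C = genre_df.agg(F.avg('vote_average')).first()[0]                # mean rating across the genre
    m = genre_df.approxQuantile('vote_count', [percentile], 0.01)[0]  # minimum-votes cutoff
    qualified = genre_df.filter(F.col('vote_count') >= m)
    wr = (F.col('vote_count') / (F.col('vote_count') + m) * F.col('vote_average')
          + m / (F.col('vote_count') + m) * C)
    return qualified.withColumn('weighted_rating', wr) \
                    .orderBy(F.col('weighted_rating').desc())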
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext

conf = SparkConf()
conf.setMaster("spark://Sarthaks-MBP:7077") \
    .setAppName('IPL Analytics Job') \
    .set("spark.executor.memory", "512m")
sc = SparkContext(conf=conf)

# Keep only ball-by-ball rows ("ball" in the first column)
a = sc.textFile("Dataset/*.csv") \
      .map(lambda line: line.split(",")) \
      .filter(lambda line: line[0].strip() == "ball") \
      .collect()

player_vs_player = {}
for details in a:
    players1 = (details[4], details[6])  # (batsman, bowler)
    players2 = (details[5], details[6])  # (non-striker, bowler)
    if players1 in player_vs_player:
        player_vs_player[players1]['total'] += int(details[7])
        player_vs_player[players1]['runs'][int(details[7])] += 1
        player_vs_player[players1]['balls'] += 1
        if details[9] != '""' and details[9] != 'run out' and players1[0].strip() == details[10].strip():
            player_vs_player[players1]['wickets'] += 1
    else:
        player_vs_player[players1] = {}
        player_vs_player[players1]['total'] = int(details[7])
        player_vs_player[players1]['runs'] = [0, 0, 0, 0, 0, 0, 0, 0]
        player_vs_player[players1]['runs'][int(details[7])] += 1  # record the first ball as well
        player_vs_player[players1]['wickets'] = 0
        player_vs_player[players1]['balls'] = 1
        if details[9] != '""' and details[9] != 'run out' and players1[0].strip() == details[10].strip():
            # The original snippet ends mid-statement; this body is inferred
            # from the parallel branch above.
            player_vs_player[players1]['wickets'] = 1
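Once the loop has run, per-matchup statistics fall out of the dictionary directly. An assumed usage example (the key names follow the structure built above):

# Top 10 batsman-vs-bowler matchups by total runs, with strike rate.
for (batsman, bowler), stats in sorted(player_vs_player.items(),
                                       key=lambda kv: kv[1]['total'],
                                       reverse=True)[:10]:
    strike_rate = 100.0 * stats['total'] / stats['balls']
    print("%s vs %s: %d runs off %d balls (SR %.1f), dismissed %d time(s)"
          % (batsman, bowler, stats['total'], stats['balls'],
             strike_rate, stats['wickets']))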
import os
import sys
assert sys.version_info >= (3, 5)

from pyspark.sql import SparkSession, functions, types


def main(input):
    business = spark.read.json(input).repartition(80)
    # Split the comma-separated category string into an array and drop rows
    # with an empty city or null values.
    split_col = functions.split(business['categories'], ',')
    business = business.withColumn("categories", split_col) \
                       .filter(business["city"] != "").dropna()
    business.createOrReplaceTempView("business")

    b_etl = spark.sql(
        "SELECT business_id, name, city, state, latitude, longitude, stars, "
        "review_count, is_open, categories, attributes FROM business").cache()
    b_etl.createOrReplaceTempView("b_etl")

    # Distance of each business from its state's centroid, used to drop outliers.
    outlier = spark.sql(
        "SELECT b1.business_id, "
        "SQRT(POWER(b1.latitude - b2.avg_lat, 2) + POWER(b1.longitude - b2.avg_long, 2)) AS dist "
        "FROM b_etl b1 INNER JOIN "
        "(SELECT state, AVG(latitude) AS avg_lat, AVG(longitude) AS avg_long "
        "FROM b_etl GROUP BY state) b2 "
        "ON b1.state = b2.state ORDER BY dist DESC")
    outlier.createOrReplaceTempView("outlier")

    joined = spark.sql(
        "SELECT b.* FROM b_etl b INNER JOIN outlier o "
        "ON b.business_id = o.business_id WHERE o.dist < 10")
    joined.write.parquet("yelp-etl/business_etl", mode="overwrite")


if __name__ == '__main__':
    data_path = os.getcwd() + "/yelp-dataset/"
    business_filepath = data_path + 'yelp_academic_dataset_business.json'
    spark = SparkSession.builder.appName('Yelp ETL').getOrCreate()
    assert spark.version >= '2.3'
    main(business_filepath)
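After the ETL has run, the filtered businesses can be read back for analysis. A minimal usage sketch (the yelp-etl/business_etl path matches the write above; the groupBy is just an illustration):

etl = spark.read.parquet("yelp-etl/business_etl")
etl.groupBy("state").count().orderBy("count", ascending=False).show(10)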