def generate_parquet(feature_path, mask_path, output_path):
    """Generate a parquet file with two columns.

    - First column: np_array representing an image
    - Second column: np_array representing its mask

    Arguments:
        feature_path -- path to all images
        mask_path -- path to the masks of the images
        output_path -- parquet output path
    """
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession

    rowgroup_size_mb = 256

    spark_conf = SparkConf().setAppName('Image preprocess')
    sc = SparkContext(conf=spark_conf)
    session = SparkSession(sc)

    # Load images and convert them to a dataframe; zipWithIndex supplies the
    # per-record id that the join below relies on
    images_rdd = sc.binaryFiles(feature_path).values().zipWithIndex()
    image_flat_numpy_rdd = images_rdd \
        .map(lambda pair_raw_image_id: (raw_image_to_numpy_array(pair_raw_image_id[0]),
                                        pair_raw_image_id[1])) \
        .map(lambda pair_np_array_id: {'features': pair_np_array_id[0],
                                       'id': pair_np_array_id[1]}) \
        .map(lambda x: dict_to_spark_row(FeatureSchema, x))
    image_df = session.createDataFrame(image_flat_numpy_rdd,
                                       FeatureSchema.as_spark_schema())

    # Load masks and convert them to a dataframe
    mask_rdd = sc.binaryFiles(mask_path).values().zipWithIndex()
    # Convert mask RGB values to 0 (not building) and 1 (building)
    mask_flat_numpy_rdd = mask_rdd \
        .map(lambda pair_raw_image_id: (raw_image_to_numpy_array(pair_raw_image_id[0]),
                                        pair_raw_image_id[1])) \
        .map(lambda pair_np_array_id: ((pair_np_array_id[0] / 255).astype(np.uint8),
                                       pair_np_array_id[1])) \
        .map(lambda pair_std_np_array_id: {'masks': pair_std_np_array_id[0],
                                           'id': pair_std_np_array_id[1]}) \
        .map(lambda x: dict_to_spark_row(MaskSchema, x))
    mask_df = session.createDataFrame(mask_flat_numpy_rdd,
                                      MaskSchema.as_spark_schema())
    mask_df.show(5, False)

    # Join image_df and mask_df row by row on the shared id, then drop it
    train_df = image_df.join(mask_df, "id", "inner").drop('id')

    with materialize_dataset(session, output_path, TrainSchema, rowgroup_size_mb):
        train_df.write \
            .mode('overwrite') \
            .parquet(output_path)
def generate_parquet(feature_path, mask_path, output_path):
    """Generate a parquet file with two columns.

    - First column: np_array representing an image
    - Second column: np_array representing its mask

    Arguments:
        feature_path -- path to all images
        mask_path -- path to the masks of the images
        output_path -- parquet output path
    """
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import monotonically_increasing_id

    rowgroup_size_mb = 256

    spark_conf = SparkConf().setAppName('Image preprocess')
    sc = SparkContext(conf=spark_conf)
    session = SparkSession(sc)

    # Load images and convert them to a dataframe
    images_rdd = sc.binaryFiles(feature_path)
    image_flat_numpy_rdd = images_rdd.values().map(raw_image_to_numpy_array) \
        .map(lambda x: {'features': x}) \
        .map(lambda x: dict_to_spark_row(FeatureSchema, x))
    image_df = session.createDataFrame(image_flat_numpy_rdd,
                                       FeatureSchema.as_spark_schema()) \
        .withColumn("id", monotonically_increasing_id())  # Generate table row id

    # Load masks and convert them to a dataframe
    mask_rdd = sc.binaryFiles(mask_path)
    mask_flat_numpy_rdd = mask_rdd.values().map(raw_image_to_numpy_array) \
        .map(lambda image_np_array: (image_np_array / 255).astype(np.uint8)) \
        .map(lambda x: {'masks': x}) \
        .map(lambda x: dict_to_spark_row(MaskSchema, x))
    mask_df = session.createDataFrame(mask_flat_numpy_rdd,
                                      MaskSchema.as_spark_schema()) \
        .withColumn("id", monotonically_increasing_id())  # Generate table row id

    # Concatenate image_df and mask_df row by row
    train_df = image_df.join(mask_df, "id", "outer").drop("id")

    with materialize_dataset(session, output_path, TrainSchema, rowgroup_size_mb):
        train_df.write \
            .mode('overwrite') \
            .parquet(output_path)
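# --- Hedged sketch (not part of the original snippets): the two generate_parquet
# variants above rely on petastorm (dict_to_spark_row, materialize_dataset) and on
# FeatureSchema / MaskSchema / TrainSchema Unischema objects defined elsewhere.
# Below is one plausible way those schemas could look, matching the second variant
# (where the id column is added afterwards with monotonically_increasing_id); the
# 256x256 shapes and field names are assumptions, not taken from the source.
import numpy as np
from petastorm.codecs import NdarrayCodec
from petastorm.unischema import Unischema, UnischemaField

FeatureSchema = Unischema('FeatureSchema', [
    UnischemaField('features', np.uint8, (256, 256, 3), NdarrayCodec(), False),
])

MaskSchema = Unischema('MaskSchema', [
    UnischemaField('masks', np.uint8, (256, 256), NdarrayCodec(), False),
])

TrainSchema = Unischema('TrainSchema', [
    UnischemaField('features', np.uint8, (256, 256, 3), NdarrayCodec(), False),
    UnischemaField('masks', np.uint8, (256, 256), NdarrayCodec(), False),
])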
def main(argv):
    logging.config.fileConfig(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "logging.ini"))
    parsed_args = parse_args(argv)
    spark_conf = SparkConf()
    sc = SparkContext(conf=spark_conf)
    with open(parsed_args.config) as in_config:
        preprocess_conf = json.load(in_config)
    if preprocess_conf.get("binary_input", True):
        files = sc.binaryFiles(preprocess_conf["input"],
                               preprocess_conf.get('partitions', 4000))
    else:
        files = sc.wholeTextFiles(preprocess_conf["input"],
                                  preprocess_conf.get('partitions', 4000))
    files = files.repartition(preprocess_conf.get('partitions', 4000))
    metadata = parse_metadata(preprocess_conf["labeled"]["metadata"])
    labeled = sc.textFile(preprocess_conf["labeled"]["file"],
                          preprocess_conf.get('partitions', 4000)) \
        .map(lambda x: parse_labeled_line(x, metadata, True)) \
        .filter(lambda x: x.iloc[0]["label"] != 4) \
        .map(transform_labels)
    header, resampled = prep.preprocess(
        sc,
        files,
        labeled,
        label=preprocess_conf.get('label', True),
        cut=preprocess_conf.get("cut", {"low": 6300, "high": 6700}),
        pca=preprocess_conf.get("pca", None),
        partitions=preprocess_conf.get('partitions', 100))
    resampled.map(
        lambda x: x.to_csv(None, header=None).rstrip("\n")).saveAsTextFile(
            preprocess_conf["output"])
def main(): conf = SparkConf().setAppName("binarize nifti") sc = SparkContext(conf=conf) sc.setLogLevel('ERROR') parser = argparse.ArgumentParser(description='Binarize images') parser.add_argument('threshold', type=int, help="binarization threshold") parser.add_argument('folder_path', type=str, help='folder path containing all of the splits') parser.add_argument('output_path', type=str, help='output folder path') parser.add_argument('num', type=int, choices=[2, 4, 6, 8], help='number of binarization operations') parser.add_argument('-m', '--in_memory', type=bool, default=True, help='in memory computation') args = parser.parse_args() nibRDD = sc.binaryFiles(args.folder_path)\ .map(lambda x: get_data(x)) client = Config().get_client('dev') if args.in_memory == 'True': print "Performing in-memory computations" for i in xrange(num - 1): nibRDD = nibRDD.map(lambda x: binarize(x, args.threshold)) nibRDD = nibRDD.map(lambda x: binarize_and_save( x, args.threshold, args.output_path, client)).collect() else: print "Writing intermediary results to disk and loading from disk" binRDD = nibRDD.map(lambda x: binarize_and_save( x, args.threshold, args.output_path + "1", client)).collect() for i in xrange(num - 1): binRDD = sc.binaryFiles(args.output_path + "1")\ .map(lambda x: get_data(x))\ .map(lambda x: binarize_and_save(x, args.threshold, args.output_path + "1", client)).collect()
def main(): sc = SparkContext(appName="tileMapper") print("I do all the input output jazz") ########################################################################### big_image = sc.binaryFiles("Reference/108103_sm.jpg") tile_avgs = big_image.flatMap(extract_opencv_tiles()) #buckets = tile_avgs.collect() #print("Bucket",buckets) tileMap = tile_avgs.map( lambda l: [item for sublist in l for item in sublist]) tileList = tileMap.collect() print("Tile Map", tileMap) print("Tile Map", tileMap.collect()) print("Tile List", tileList) print("Tile LIst", type(tileList)) ############################################################################ clusterIndex = getIndex() kmModel = KMeansModel.load(sc, "myModelPath") readyToCombine = [] currentRow = None noOfRow = 0 noOfCol = 0 firstTile = tileList[0] tileSize = firstTile[1] #Randomly Get small images using kmeans match for tile in tileList: if tile[0] == currentRow: smallImg = findSmallImage(kmModel, [tile[4], tile[5], tile[6]], tileSize, clusterIndex) readyToCombine.append(smallImg) noOfCol = noOfCol + 1 else: currentRow = tile[0] noOfCol = 1 noOfRow = noOfRow + 1 currentRow = tile[0] smallImg = findSmallImage(kmModel, [tile[4], tile[5], tile[6]], tileSize, clusterIndex) readyToCombine.append(smallImg) #Put small images into the big image canvas canvas = np.zeros((noOfRow * tileSize, noOfCol * tileSize, 3), np.uint8) #Print Image print("No. of Col", noOfCol) print("No. of Row", noOfRow) #print("Before Print, Check Once again",readyToCombine) mosaicImage = printImage(readyToCombine, canvas, noOfCol, noOfRow, tileSize) print("Finished processing of image") cv2.imwrite('mosaicImageYeah.jpg', mosaicImage)
def main(sujet):
    #conf = SparkConf()
    #conf.set("spark.executor.memory", "4g")
    #sc = SparkContext(conf=conf)
    sc = SparkContext()
    spark = SparkSession.builder.getOrCreate()

    # Load files
    #rdd = sc.binaryFiles('hdfs://localhost:9000/data-wiki/work/historique.avro')
    rdd = sc.binaryFiles(parm_histo)

    # Parse avro files
    nodes = rdd.flatMap(lambda args: fastavro.reader(BytesIO(args[1])))

    # Convert to a resilient distributed dataset (RDD) of rows
    rows = nodes.map(lambda node: Row(**node))

    # Convert to a Spark dataframe
    df = spark.createDataFrame(rows, samplingRatio=1)

    # Cache data to avoid re-computing everything
    df.persist()
    historique = df

    #liensql = spark.read.format("avro").load("hdfs://localhost:9000/data-wiki/work/pagesql.avro")
    liensql = spark.read.format("avro").load(parm_sql)

    # Retrieve the contributors for the subject
    sel_historique = historique.filter(historique.title == sujet)
    title_historique = sel_historique.first().title
    id_historique = sel_historique.first().id
    dt = sel_historique.select(explode(sel_historique.contributors)).groupBy("col").count()

    # Previous history links
    liensql_from = liensql.filter(liensql.page_title == sujet).join(
        historique, historique.id == liensql.page_id)
    df = liensql_from.select(explode(liensql_from.contributors)).groupBy("col").count()
    dtf = df.unionAll(dt).orderBy('count', ascending=False)

    # Next history links
    liensql_to = liensql.filter(liensql.page_id == id_historique).join(
        historique, historique.title == liensql.page_title)
    dt = liensql_to.select(explode(liensql_to.contributors)).groupBy("col").count()
    dc = dtf.unionAll(dt).orderBy('count', ascending=False)

    dc.createTempView("datasql")
    spark.sql("SELECT col as contributeur, count as score FROM datasql limit 3").show()
    print("The best contributors for the Wikipedia subject: " + sys.argv[3])
def main(args):
    conf = SparkConf().setMaster("local[4]").setAppName("transport")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    if args.environment == "local":
        input_path = "datasets/test-folder.small/*.zip"
        output_path = "./"
    elif args.environment == "cloud":
        input_path = "s3n://dtpm-transactions/test-folder.small/*.zip"
        output_path = "s3n://dtpm-transactions/parquet/"

    rdd = sc.binaryFiles(input_path).flatMap(
        lambda a: extract_files(a[0], a[1]))

    # Decode bytes and convert each file into a list of strings
    rdd = rdd.mapValues(
        lambda file: BytesIO(file).read().decode('cp1252').split('\n'))

    # Drop header and last (empty) row
    rdd = rdd.mapValues(lambda table: table[1:-1])

    # Change type of columns
    rdd = rdd.flatMap(lambda a: prepare_csv(a[0], a[1]))

    header = [
        'FILE_NAME', 'FECHAHORATRX', 'CODIGOENTIDAD', 'NOMBREENTIDAD',
        'CODIGOSITIO', 'NOMBRESITIO', 'NROTARJETA'
    ]
    header = list(map(lambda a: a.lower(), header))
    df = rdd.toDF(header)

    days = [
        row.file_name for row in df.select('file_name').distinct().collect()
    ]
    for directory in days:
        if not os.path.exists(directory):
            os.makedirs(directory)
        df_day = df.select(df.columns[1:]).where(df.file_name == directory)
        df_day.write.parquet(output_path + directory + "/data.parquet",
                             compression="gzip")
class SparkUtils:

    def __init__(self, master, app_name):
        if os.environ["pfe_env"] != "dev":
            self.sc = SparkContext(appName=app_name)
            self.sc.addFile('/FileProcessor.py')
            self.sc.addFile('/FileIndexProducer.py')
            self.sc.addFile('/FileIndexRepository.py')
            self.sc.addFile('/FileUrlProcessor.py')
            self.sc.addFile('/LdaTopicsDescriptionProducer.py')
            self.sc.addFile('/LdaTopicsDescriptionRepository.py')
            self.sc.addFile('/Parser.py')
            self.sc.addFile('/SparkProcessor.py')
            self.sc.addFile('/SparkUtils.py')
            self.sc.addFile('/TextMostCommonWordsExtractor.py')
            self.sc.addFile('/TextPreProcessor.py')
            self.sc.addFile('/TextSummarizer.py')
            self.sc.addFile('/thumbnail_temp.py')
            self.sc.addFile('/ThumbnailGenerator.py')
            self.sc.addFile('/NotificationConstants.py')
            self.sc.addFile('/RabbitMqConstants.py')
        else:
            self.sc = SparkContext(master=master, appName=app_name)
        self.sql_context = SQLContext(self.sc)

    # output rdd: (url, b'content')
    def read_files(self, path):
        return self.sc.binaryFiles(path)

    def rdd_to_df(self, rdd, schema):
        df = self.sql_context.createDataFrame(rdd, schema)
        return df

    def join_df(self, df0, df1, join_col, df0_selected_cols, df1_selected_cols):
        df0_selected_cols = ["df0." + x for x in df0_selected_cols]
        df1_selected_cols = ["df1." + x for x in df1_selected_cols]
        df0 = df0.alias('df0')
        df1 = df1.alias('df1')
        df = df0.join(df1, col("df0." + join_col) == col("df1." + join_col))\
            .select(df0_selected_cols + df1_selected_cols)
        return df
def run():
    aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
    ds = os.getenv('DATA_SOURCE')

    conf = SparkConf()
    sc = SparkContext(conf=conf)

    # https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html
    # Authenticating with S3
    sc._jsc.hadoopConfiguration().set('fs.s3a.access.key', aws_access_key_id)
    sc._jsc.hadoopConfiguration().set('fs.s3a.secret.key', aws_secret_access_key)

    imgs = sc.binaryFiles(ds)
    imgs.foreach(
        partial(
            upload_img_to_s3,
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key
        )
    )
def main(): conf = SparkConf().setAppName("binarize nifti") sc = SparkContext(conf=conf) sc.setLogLevel('ERROR') parser = argparse.ArgumentParser( description='Binarize images using FSL installed in a Docker container' ) parser.add_argument('threshold', type=int, help="binarization threshold") parser.add_argument('folder_path', type=str, help='folder path containing all of the splits') parser.add_argument('output_path', type=str, help='output folder path') args = parser.parse_args() print args.folder_path client = Config().get_client('dev') nibRDD = sc.binaryFiles(args.folder_path)\ .map(lambda x: get_data(x))\ .map(lambda x: binarize(x, args.threshold))\ .map(lambda x: copy_to_hdfs(x, args.output_path, client)).collect()
from PIL import Image, ImageFile
import pyspark
from pyspark import SparkContext
import numpy as np
import pydoop.hdfs as hdfs
import os
import string
import random
import cv2
from time import time
from StringIO import StringIO  # Python 2; use io.BytesIO on Python 3

sc = SparkContext("spark://discus-p2irc-master:7077", "imageSharpening")
#sc = SparkContext("local", "sharpenedImages")

processing_start_time = time()

images_rdd = sc.binaryFiles(
    'hdfs://discus-p2irc-master:54310/user/hduser/landsat_images', 100)
#images_rdd = sc.binaryFiles('file:///sparkdata/p2irc-images', 264)


def images_to_bytes(rawdata):
    # Tolerate partially written files when decoding
    ImageFile.LOAD_TRUNCATED_IMAGES = True
    return (rawdata[0], Image.open(StringIO(rawdata[1])).convert('RGB'))


# Keep the (path, bytes) pair that images_to_bytes expects
images_bytes = images_rdd.map(images_to_bytes)
images_bytes.persist(pyspark.StorageLevel.MEMORY_AND_DISK)

processing_end_time = time() - processing_start_time
import os
import json
from io import BytesIO

import fastavro
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row

sc = SparkContext()
spark = SparkSession.builder.getOrCreate()

# Load files as (filename, content) pairs
rdd = sc.binaryFiles('hdfs://localhost:9000/data/paris/master/full/*.avro')
# If it takes too long to process all files, you may want to reduce the number
# of processed files. E.g:
# rdd = sc.binaryFiles('hdfs://localhost:9000/data/paris/master/full/2.250182*.avro')

# Parse avro files
nodes = rdd.flatMap(lambda args: fastavro.reader(BytesIO(args[1])))

# Convert to a resilient distributed dataset (RDD) of rows
rows = nodes.map(lambda node: Row(**node))

# Convert to a Spark dataframe
df = spark.createDataFrame(rows)

# Cache data to avoid re-computing everything
df.persist()

print("There are %d nodes in the dataset" % df.count())
from pyspark import SparkContext, SparkConf
from StringIO import StringIO
from PIL import Image
import numpy as np
import os, tempfile
import datetime

sc = SparkContext()

# AWS S3 credentials:
AWS_KEY = ""
AWS_SECRET = ""

sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_KEY)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET)

directory = 's3n://amlyelp/subset/trainnew/'
images = sc.binaryFiles(directory)

image_to_array = lambda rawdata: np.asarray(Image.open(StringIO(rawdata)))
image_array = images.map(lambda x: (x[0], image_to_array(x[1])))
image_array_flatten = image_array.map(lambda x: (x[0], x[1].flatten()))
image_array_flatten = image_array_flatten \
    .map(lambda x: (x[0].split('/')[-1].replace('.jpg', ''),
                    " ".join(np.char.mod('%d', x[1])))) \
    .repartition(120).cache()

image_array_flatten.saveAsTextFile("s3n://amlyelp/subset/train_image_array/")
#images.append(Image("/home/hduser/dev-materials/", "IMG_%04d_1.png" % i)) images.append(Image("/home/hduser/dev-materials/", "IMG_%04d_1.tif" % (i + 50))) #result = stitch_two(images[0], images[1]) #fileName = "stitched_two.png" #result[0].save(path="/home/hduser/dev-materials/", filename=fileName) result, images = stitch_multiple(images) output_image_name = "stitched_img_.png" result.save(path="/home/hduser/dev-materials/", filename=output_image_name) """ reading_start_time = time() images_rdd = sc.binaryFiles( 'hdfs://discus-p2irc-master:54310/user/hduser/registration_images_tif', 100) images_bytes = images_rdd.map(read_images) \ .map(lambda rawdata: (rawdata[0][78:79], [rawdata[1]])) \ .reduceByKey(lambda first_image, second_image: first_image + second_image) images_bytes.persist(pyspark.StorageLevel.MEMORY_AND_DISK_SER) reading_end_time = time() - reading_start_time processing_start_time = time() images_bytes.foreach(stitch_multiple) processing_end_time = time() - processing_start_time """ .groupByKey() \ .mapValues(list)
import re
import os
import sys

import numpy as np
from pyspark import SparkContext

srtm_dtype = np.dtype('>i2')
filename_regex = re.compile(r'([NSEW]\d+[NSEW]\d+).*')

# The data directory, needs to be available to all nodes in the cluster
data_files = '/media/bitbucket/srtm/version2_1/SRTM3/North_America'

# Build up the context, using the master URL
sc = SparkContext('spark://ulex:7077', 'srtm')

# Now load all the zip files into an RDD
data = sc.binaryFiles(data_files)

# The two accumulators are used to collect values across the cluster
num_samples_acc = sc.accumulator(0)
sum_acc = sc.accumulator(0)


# Convert raw HGT bytes into a 1201x1201 elevation array
def read_array(data):
    hgt_2darray = np.flipud(
        np.fromstring(data, dtype=srtm_dtype).reshape(1201, 1201))
    return hgt_2darray


# Function to process a HGT file
def process_file(file):
    (name, content) = file
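# --- Hedged sketch (not part of the original snippet): one plausible body for
# process_file, assuming each binaryFiles record is a zipped SRTM .hgt tile and
# that the two accumulators collect the global sample count and elevation sum.
# The SRTM void value of -32768 is masked out before accumulating.
import zipfile
from io import BytesIO


def process_file_sketch(file):
    (name, content) = file
    if filename_regex.match(os.path.basename(name)) is None:
        return
    with zipfile.ZipFile(BytesIO(content)) as archive:
        hgt_2darray = read_array(archive.read(archive.namelist()[0]))
    valid = hgt_2darray[hgt_2darray != -32768]  # drop SRTM void samples
    num_samples_acc.add(valid.size)
    sum_acc.add(int(valid.sum()))

# e.g. data.foreach(process_file_sketch)
#      mean_elevation = float(sum_acc.value) / num_samples_acc.value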
import json
from io import BytesIO

import fastavro
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
sqlContext = SQLContext(sc)

'''
from pyspark import SparkContext
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

sc = SparkContext()
sqlContext = SQLContext(sc)

#df = spark.read.format("avro").load("sample.avro")
df = sqlContext.read.format("com.databricks.spark.avro").load("sample.avro")
df.show()
sc.stop()
'''

with open('data_schema.json') as f:
    schema = json.load(f)

print(type(schema))
print(schema)

rdd = sc.binaryFiles("/home/vsahu/project/spark/sample.avro").flatMap(
    lambda args: fastavro.reader(BytesIO(args[1]), reader_schema=schema))
print(rdd.collect())

df = rdd.toDF()
#df = sqlContext.createDataFrame(rdd, schema)
df.write.parquet("sample1.parquet")
def readimage(path): with open(path, "rb") as f: return bytearray(f.read()) execution_path = os.getcwd() directory = 'hdfscontentFromFlume' #directory = '/flume/eventdata' IMAGE_SIZE = (10,7.5) with tf.device('/gpu:0'): with detection_graph.as_default(): with tf.Session(graph=detection_graph) as sess: for filename in filelst: print('Fetching image byte stream '+filename) try: byteFileAsRdd = sc.binaryFiles('hdfs://localhost:9000'+filename).take(1) img_str = bytearray(byteFileAsRdd[0][1]) #img_str = readimagehdfs(filename) arr = np.asarray(img_str, dtype=np.uint8) image = cv2.imdecode( arr, -1) #img_str = readimage(directory+'/'+filename) #arr = np.asarray(img_str, dtype=np.uint8) #image = cv2.imdecode( arr, -1) if (type(image) is np.ndarray): # Fetched image now detect objects image_np = image # Expand dimensions since the model expects images to have shape: [1, None, None, 3] image_np_expanded = np.expand_dims(image_np, axis=0) image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# This is the path of the directory where the images will be stored after a face is detected.
# After a face is detected in an image, we draw a rectangle around the face and store that
# image in the directory below.
rect_img_dir = './face_detected/'

# Haar Cascade Classifier (from the OpenCV library)
# This classifier will be used to detect frontal faces in the images.
# Give the path of the classifier below.
distCascade = "./haarcascade_frontalface_default.xml"

# This ships the cascade file to the different nodes in the Spark cluster.
# This is necessary if you run this Spark code on a multi-node Spark cluster.
sc.addFile(distCascade)

# Converting the images into an RDD
images_RDD = sc.binaryFiles(img_dir)
# For more details about this function you can do help(sc.binaryFiles).

# If you have a large number of images to process (like a million), Spark will by default
# make a lot of partitions. To repartition your image data into fewer partitions, you can
# run the command below and change the number of partitions to what you want.
#images_RDD = images_RDD.repartition(20000)


# Face detection function
def face_detect(an_img_rdd_element):
    x = an_img_rdd_element[0]
    img = an_img_rdd_element[1]
    img_fname = x.split("/")[-1]
    file_bytes = np.asarray(bytearray(img), dtype=np.uint8)
    im = cv2.imdecode(file_bytes, 1)
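# --- Hedged sketch (not part of the original snippet): a plausible continuation of
# face_detect, written as a self-contained helper. It assumes the cascade shipped via
# sc.addFile is loaded on each worker through SparkFiles.get and that annotated images
# are written into rect_img_dir; the helper name and return value are illustrative only.
import cv2
import numpy as np
from pyspark import SparkFiles


def face_detect_sketch(an_img_rdd_element):
    path, img = an_img_rdd_element
    img_fname = path.split("/")[-1]
    file_bytes = np.asarray(bytearray(img), dtype=np.uint8)
    im = cv2.imdecode(file_bytes, 1)
    # Load the Haar cascade that was distributed to this worker
    cascade = cv2.CascadeClassifier(
        SparkFiles.get("haarcascade_frontalface_default.xml"))
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    faces = cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
    # Draw a rectangle around every detected face
    for (fx, fy, fw, fh) in faces:
        cv2.rectangle(im, (fx, fy), (fx + fw, fy + fh), (0, 255, 0), 2)
    return (img_fname, im, len(faces))

# e.g. detected_RDD = images_RDD.map(face_detect_sketch)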
    imgs2 = pca_reduced_features.filter(lambda x: x[0] in z2)
    eucledian_distance2 = ziplist2.cartesian(imgs2)
    eucledian_distance2 = eucledian_distance2.map(lambda x: (calcdistance(
        x[0][1], x[1][1]), (x[0][0], x[1][0]))).sortByKey(ascending=True)
    print(eucledian_distance2.collect())
    print("\n\n")


if __name__ == "__main__":
    conf = SparkConf()
    conf.setAppName("Assignment 2")
    sc = SparkContext(conf=conf)

    # Reading the files
    rdd = sc.binaryFiles("hdfs:/data/large_sample")

    # Filtering the images: keep only the file name as the key
    rdd = rdd.map(lambda x: ((x[0].split("/")), x[1])).map(lambda x: (x[0][-1], x[1]))

    # Getting the arrays corresponding to each image
    rdd = rdd.map(lambda x: (x[0], getOrthoTif(x[1])))
    #print(rdd.collect())

    # Breaking each image into 25 evenly sized subimages
    rdd = rdd.map(lambda x: (x[0], breakimage(x[1], 500)))
    #print(rdd.collect())

    # Creating a final rdd of (imagename, array)
    finalrdd = rdd.flatMap(
conf.set("spark.executor.memory", '17g') # conf.set('spark.driver.memory','8g') # conf.set('spark.memory.offHeap.enabled','true') # conf.set('spark.memory.offHeap.size','15g') # conf.set('spark.cores.max','5') # conf.set('spark.driver.maxResultSize','8g') # conf.set('spark.sql.shuffle.partitions','1000') # conf.set('spark.storage.memoryFraction','0.1') conf.set('spark.yarn.executor.memoryOverhead', '3g') sc = SparkContext(conf=conf) # <h2>Create RDD of Forest Gain Images</h2> # In[310]: rdd = sc.binaryFiles("hdfs://proj-working-m:8020/user/saif/gain/") # <h2> Map function for Gain images </h2> # <p>Counts the total forest gain in a particular image. Each image cover 10x10degrees in the world map.<br> # Images have values 0's for no forest and 1's for forest gain. These outputs constitute a part of our final results # to report the total forest gain in all of South America from 2000-2012 </p> # <h3> Pipeline used:</h3> # <ul> # <li>Spark</li> # <li>Hadoop(HDFS)</li> # </ul> # In[311]: def mapper_func(x):
if __name__ == '__main__':
    from pyspark import SparkContext, SparkConf

    parser = argparse.ArgumentParser()
    parser.add_argument('src_tif_dir', help='Directory with files to reproject')
    parser.add_argument('dst_dir', help='Directory to write reprojected files')
    parser.add_argument('--data-name', help='Optional identifier to prefix files with',
                        default='')
    parser.add_argument('--dst-crs', help='CRS to reproject files to',
                        default='EPSG:3857')
    parser.add_argument('--extension', help='Only consider files ending in this extension',
                        default='')
    parser.add_argument('--region', help='Region for the S3 client to use', default='')
    args = parser.parse_args()

    spark_conf = SparkConf().setAppName('Rainfall-Reprojection')
    sc = SparkContext(conf=spark_conf)

    raw_tifs = sc.binaryFiles(args.src_tif_dir)
    if args.extension:
        raw_tifs = raw_tifs.filter(lambda (path, _): path.endswith(args.extension))

    reprojected_tifs = raw_tifs.map(
        lambda (src_tif_path_remote, tif_bytes): process_tif(
            src_tif_path_remote, tif_bytes, args.data_name, args.dst_crs,
            args.dst_dir, args.region
        )
    )

    num_reprojected = reprojected_tifs.count()
# Choice of number of blocks, being Blocks * Blocks in total
Blocks = 8

# Threshold of the edge map
T = 50

# Size of the filter and number of rows/columns to be extended
filterSize = 3
numExt = (filterSize - 1) // 2

# Getting an instance of the Spark context
sc = SparkContext()

# Obtaining an RDD from HDFS
hdfsDirectory = 'hdfs://localhost:9000/SampleImages/'
rdd = sc.binaryFiles(hdfsDirectory + '*')

# Decoding the images -- file_params (fileName, binary)
rdd = rdd.map(lambda file_params: (
    file_params[0],
    cv2.imdecode(np.asarray(bytearray(file_params[1]), dtype=np.uint8), 1)))

# file_params (fileName, img) -> file_params (i, (fileName, img))
rdd = rdd.flatMap(lambda file_params: extendVertical(file_params))

# file_params (i, (fileName, img)) -> file_params ((i,j), (fileName, img))
rdd = rdd.flatMap(lambda file_params: extendHorizontal(file_params))

# Transforming the images to a gray color scale -- rdd input: file_params ((i,j), (fileName, img))
rdd = rdd.map(lambda file_params: ((file_params[0][0], file_params[0][1]), (
    file_params[1][0], cv2.cvtColor(file_params[1][1], cv2.COLOR_BGR2GRAY))))
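# --- Hedged sketch (not part of the original snippet): extendVertical and
# extendHorizontal are defined elsewhere. Below is one plausible reading of
# extendVertical, based on the comments above: split each image into `Blocks`
# horizontal strips keyed by the strip index i, extending every strip by numExt
# rows so neighbouring strips overlap enough for a filter of size filterSize.
def extendVertical_sketch(file_params):
    file_name, img = file_params
    height = img.shape[0]
    block_h = height // Blocks
    strips = []
    for i in range(Blocks):
        top = max(i * block_h - numExt, 0)
        bottom = min((i + 1) * block_h + numExt, height)
        strips.append((i, (file_name, img[top:bottom])))
    return strips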
# (field0, series.y)
def eval_flow_cde(x):
    return eval_flow_spark(x, bc_output_dir.value)


def plotImage(x):
    t0 = time.time()
    distribution_plot_subsets_spark(x[1], bc_output_dir.value)
    print "plotImage used: ", time.time() - t0
    # print("-----" + x[0], x[1])


# hdfs
startTime = time.time()
t0 = time.time()
hdfsFile = sc.binaryFiles(input_dir).persist(StorageLevel.MEMORY_AND_DISK)

######################################################### Total Time Used: 8.73000001907
adaf_objs = hdfsFile.map(read_dat_hdfs)\
    .map(ExtractVIN) \
    .map(process_dat_adaf).map(sort_adaf) \
    .map(vehical_config) \
    .filter(do_filter)\
    .map(subsetMetaData)
######################################################### Total Time Used: 25.1680002213

# rdd_vehical_config = hdfsFile.map(read_dat_hdfs) \
#     .map(ExtractVIN) \
#     .map(process_dat_adaf).map(sort_adaf) \
#     .map(vehical_config)
# rdd_vehical_config.count()
# print("rdd_vehical_config Used Time: {}".format(time.time() - t0))
OUTPUT_FILE_TYPE = ".png" # Directory to store registered images OUTPUT_FILE_PATH = output_root_path # Directory to store processed registered images OUTPUT_PROCESSED_PATH = output_root_path + "/processed/" # Set spark configurations sc = SparkContext(appName=job_name) reading_start_time = time() # When reading from local file system #images_rdd = sc.binaryFiles('file:///sparkdata/registration_images') # When reading from HDFS images_rdd = sc.binaryFiles(input_path) # Calculate the index to use for getting images group index = images_rdd.first()[0].find("IMG_") + 4 images_group_rdd = images_rdd.map(read_images) \ .map(lambda rawdata: (rawdata[0][index:rawdata[0].rfind('_')], (rawdata[0][index:], rawdata[1]))) \ .reduceByKey(lambda first_image, second_image: (first_image + second_image)) reading_end_time = time() - reading_start_time processing_start_time = time() images_group_rdd.foreach(register_group) processing_end_time = time() - processing_start_time
t0 = tbegin = time.time()

if gen_num_blocks > 0 and gen_block_size > 0:
    rdd = sc.parallelize(range(gen_num_blocks), args.nodes * 12 * args.nparts)
    gen_block_count = gen_block_size * 1E6 / 24  # 24 bytes per vector
    print("generating %d blocks of %d vectors each..." % (gen_num_blocks, gen_block_count))
    outfile.write("generating data...\n")
    outfile.write("partition_multiplier: " + str(args.nparts) + "\n")
    outfile.write("gen_num_blocks: " + str(gen_num_blocks) + "\n")
    outfile.write("gen_block_size: " + str(gen_block_size) + "\n")
    outfile.write("total_data_size: " + str(gen_num_blocks * gen_block_size) + "\n")
    A = rdd.map(lambda x: generate(x, gen_block_count))
elif args.src:
    outfile.write("reading data...\n")
    outfile.write(args.src + "\n")
    rdd = sc.binaryFiles(args.src)
    A = rdd.map(parseVectors)
else:
    print("either --src or --generate must be specified")
    sc.stop()
    from sys import exit
    exit(-1)

#rdd.foreach(noop)  # useful to force the pipeline to execute for debugging

tmark = time.time()
outfile.write("read/parse or generate partitions: %0.6f\n" % (tmark - t0))
outfile.write("numPartitions(%d,%s): %d\n" % (A.id(), A.name(), A.getNumPartitions()))
t0 = tmark

# apply simple operation (V' = V + V0)
shift = np.array([25.25, -12.125, 6.333], dtype=np.float64)
parser.add_argument(
    '--partitions',
    default=250,
    type=int,
    help=('Number of partitions to coalesce geotiffs to, else '
          'each geotiff will end up in its own partition'))
parser.add_argument(
    '--sampling-method',
    default="nearest",
    choices=SAMPLING_METHODS.keys(),
    help='Sampling method to use during reprojection')
parser.add_argument(
    '--no-data-value',
    default=None,
    help='Value to represent no data if not set in the original geotiff')
args = parser.parse_args()

spark_conf = SparkConf().setAppName('Azavea-Data-Hub-Reprojection')
sc = SparkContext(conf=spark_conf)

sampling_method = SAMPLING_METHODS.get(args.sampling_method, RESAMPLING.nearest)

raw_tifs = sc.binaryFiles(args.src_tif_dir).coalesce(args.partitions)

reprojected_tifs = raw_tifs.map(
    lambda (src_tif_path_remote, tif_bytes): reproject_tif(
        src_tif_path_remote, tif_bytes, args.dst_crs, sampling_method,
        args.no_data_value
    )
)

reprojected_tifs.saveAsSequenceFile(args.rdd_dst)
    with client.write(outDir + '/TRANSFORM_' + image[0].split("/")[-1],
                      overwrite=True) as writer:
        writer.write(buf.getvalue())
    buf.close()


sc = SparkContext(appName="color")
sqlContext = SQLContext(sc)

inputDir = argv[1]
outputDir = argv[2]
numPartitions = int(argv[3])

df = sqlContext.read.parquet(inputDir + '/satMetadata.parquet')
first = df.first()
satHeight = first[2]
x = list(map(lambda x: x * satHeight, first[0]))
xmin = min(x)
xmax = max(x)
y = list(map(lambda x: x * satHeight, first[1]))
ymin = min(y)
ymax = max(y)
satLongitude = first[3]
satSweep = first[4]
date = first[5]
add_seconds = date
displayDate = datetime(2000, 1, 1, 12) + timedelta(seconds=add_seconds)

images = sc.binaryFiles(inputDir + '/*.png', numPartitions)
imageToArray = lambda rawdata: np.asarray(Image.open(BytesIO(rawdata))).astype(np.uint8)
imageArrays = images.mapValues(imageToArray)

# Obtain a Kerberos ticket on each partition before writing out results
imageArrays.foreachPartition(
    lambda x: check_call(["kinit", "-kt", "brad.keytab", "*****@*****.**"]))

imageArrays.map(lambda image: addMap(outputDir, image, satLongitude, xmin, xmax,
                                     ymin, ymax, displayDate)).collect()
imageArrays.map(lambda image: transform(outputDir, image, x, y, displayDate)).collect()
if __name__ == "__main__": application_start_time = time() input_path = sys.argv[1] output_path = sys.argv[2] job_name = sys.argv[3] subprocess.call(["hadoop", "fs", "-rm", "-r", output_path]) sc = SparkContext(appName=job_name) build_start_time = time() images_rdd = sc.binaryFiles(input_path) \ .map(images_to_descriptors) \ .filter(lambda x: x[1].all() != None) \ .map(lambda x: (x[0], x[1])) features = images_rdd.flatMap(lambda x: x[1]) model = KMeans.train(features, 3, maxIterations=5, initializationMode="random") clusterCenters = model.clusterCenters build_end_time = time() - build_start_time processing_start_time = time() data_to_cluster = images_rdd.map(lambda x: [x, clusterCenters])
"flower_area_bounds" : flower_area_bounds, "flower_area_mask" : flower_area_mask} # Save/Overwrite dictionary #io.imsave(path + plot_mask_name, dict_to_save) np.save(path + plot_mask_name, dict_to_save) #setImagesFilepaths('/sparkdata/tmp-dir/2016-07-05_1207/') #sc = SparkContext("local[4]", "images_plot_mask") sc = SparkContext("spark://discus-p2irc-master:7077", "images_plot_mask") #images_read = (sc.binaryFiles('hdfs://discus-p2irc-master:54310/user/hduser/plot_images/2016-07-05_1207', 12)) images_read = (sc.binaryFiles('hdfs://discus-p2irc-master:54310/user/hduser/plot_images/2016-07-05_1207', 600)) images_bytes = (images_read.map(images_to_bytes)) images_bytes.persist(pyspark.StorageLevel.MEMORY_AND_DISK_SER) images_mask_computed = images_bytes.foreach(computePlotMask) #images_histogram_computed = images_bytes.foreach(computeHistograms) #images_histogram_computed = images_bytes.foreach(computeHistograms) print plot_mask print "==========================" print "images plot mask completed" print "=========================="
:rtype dict{}: """ sim_map = defaultdict(list) for x in svd_collect: sim_map[x[0]] = x[1] return dict(sim_map) ############ ## ## PY Spark Code Section ## ############ # Timing tracker start_time = time.time() ## Read Files rdd = sc.binaryFiles(_LOCAL_FILES_REGEX) # #rdd = sc.binaryFiles('hdfs:/data/large_sample') #rdd = sc.binaryFiles('hdfs:/data/small_sample') ## Obtain RDD as:[ (filename, tiffMatrix)...] rdd2 = rdd.map(lambda kv: getTiffAsMatrix(kv)) ## Split each matrix to 500x500x4 images RDD: [ (img-0, 500x500x4),...(img-n, 500x500x4)] rdd3 = rdd2.flatMap(lambda kv: tiffmatrixSplit(kv)) ## Collect operation for 1.E data_for_print1e = rdd3.filter(lambda x:display1e(x)).collect() ## Smooth out pixels to get RDD: [(img-0,500x500), (img-1,500x500) ...] rdd4 = rdd3.map(lambda kv: tilePixelIntensityConverter(kv)) ## Call Persist on RDD at this stage rdd4.persist() ## Call down-scale of resolution on each sub image, default factor=10 ## Gives RDD[ (img-0,50x50),(img-1,50x50)...]
# sc = SparkContext.getOrCreate()

dataDir = r'hdfs:/data/large_sample/'
noOfBuckets = 135
noOfBands = 4
noOfPartitions = 46

# dataDir = r'hdfs:/data/small_sample/'
# noOfBuckets = 50
# noOfBands = 8
# noOfPartitions = 5
# dataDir = r'C:\Users\SSDN-Dinesh\Desktop\SBU\BDA\Assignment2\a2_small_sample'

# Gives key as file path and value as binary content of the file
data = sc.binaryFiles(dataDir)

# Gives key as file name and value as binary content of the file
# fileName = data.map(lambda x: (x[0].split('/')[-1], x[1]))
# outName = 'fileName'
# out1 = fileName.collect()
# broadCastFileNames = sc.broadcast(fileNames)

# Gives key as file name and value as the image array
fullImgs = data.map(lambda x: (x[0].split('/')[-1], getOrthoTif(x[1])))
# out1 = fullImgs.collect()
# outName = 'ImageShapeRDD'
# imgShape = fullImgs.map(lambda x: (x[0], x[1].shape))
# out = imgShape.collect()
# outName = 'imgShape'
departmentsWithEmployeesSeq1 = [departmentWithEmployees1, departmentWithEmployees2]
df1 = sql_context.createDataFrame(departmentsWithEmployeesSeq1)
display(df1)
'''

#dado = Row("imagem", "label")
#rdd = sc.parallelize(l)
#images = rdd.map(lambda x: Row(id=x[0]))
#dataset = sql_context.createDataFrame([dado])

path = "/Users/leopoldolusquino/Documents/Doutorado/Tese/originais/"
transformador = binary_input_transformer.BinaryInputTransformer()
numPartitions = 10

rdd = sc.binaryFiles(
    path, minPartitions=numPartitions).repartition(numPartitions).take(10)
file_bytes = np.asarray(bytearray(rdd[0][1]), dtype=np.uint8)
print(file_bytes)
image = cv2.imdecode(file_bytes, 1)
print(image)
#print(rdd.count())

# The original line below is not valid PySpark (RDDs have no .select, and
# "label".rdd is not a valid expression); kept commented for reference.
#rdd = sc.binaryFiles(path, minPartitions=numPartitions).select(
#    input_file_name(), "label".rdd)
#print(image)
    tmp_path = docfile
    os.remove(docfile)
    return res


#for i in os.listdir(tmp_path):
#    if os.path.isdir(os.path.join(tmp_path, i)):
#        shutil.rmtree(os.path.join(tmp_path, i))

path = 'file:///home/ubuntu/chenq/docx_evaluate_score/data/all_docx/input/*.docx'
path = 'file:///dev/shm/test_docx/input/*.docx'
#path = 'file:///home/ubuntu/chenq/test_docx/input/*.docx'
#path = 'file:////dev/shm/input/*.docx'
#path = 'file:////dev/shm/input2/input/*.docx'
#path = '/user/ubuntu/docx/input/*.docx'
#path = 'har:////user/ubuntu/ainput/docx.har/input/*.docx'

rdd = sc.binaryFiles(path)
#rdd.cache()
#rdd.count()
#rdd = rdd.repartition(32)

doc = rdd.map(lambda x: (x[0], read(x[1], x[0])))
#doc.foreach(print)
doc.cache()
nread = doc.count()

df = doc.toDF()
df.write.parquet(
    'file:///user/ubuntu/all_docx_10349_single_node_on_desktop5_localfile_shm_24cores_tika.parquet'
)
print(nread)
from pyspark import SparkContext, SparkConf
import numpy as np

conf = SparkConf()
conf.setMaster('spark://hadoop-maste:7077')
context = SparkContext(conf=conf)

rdd = context.binaryFiles('/datas/pics/')
print('applicationId:', context.applicationId)

result = rdd.collect()
for data in result:
    print(data[0], data[1][:10])

context.stop()
import sys

import findspark

if "-localhost" in sys.argv:
    findspark.init("/u/cs451/packages/spark")

import pyspark
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="ConvertToSequenceFile")

    input_dir_path = sys.argv[1]
    output_path = sys.argv[2]
    num_partitions = sys.argv[3]

    # Delete output_path if it already exists
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
        sc._jsc.hadoopConfiguration())
    fs.delete(sc._jvm.org.apache.hadoop.fs.Path(output_path), True)

    sc.binaryFiles(input_dir_path,
                   int(num_partitions)).saveAsSequenceFile(output_path)
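# --- Hedged usage note (not part of the original snippet): a sequence file written
# this way holds (file path, raw bytes) pairs, so it can later be read back in one
# pass without re-listing many small files, e.g.:
#
#     pairs = sc.sequenceFile(output_path)   # RDD of (path, bytearray) records
#     first_path, first_bytes = pairs.first()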
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans
from StringIO import StringIO  # Python 2; use io.BytesIO on Python 3
from PIL import Image
import numpy as np
import boto
import datetime

sc = SparkContext()

# AWS S3 credentials:
AWS_KEY = ""
AWS_SECRET = ""

sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_KEY)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET)

directory = 's3n://amlyelp/subset/trainnew/'
images = sc.binaryFiles(directory)

image_to_array = lambda rawdata: np.asarray(Image.open(StringIO(rawdata)))
image_array = images.map(lambda x: (x[0], image_to_array(x[1])))
image_array_flatten = image_array.map(lambda x: (x[0], x[1].flatten())).cache()
del image_array
del images

train = image_array_flatten.values().repartition(200).cache()
clusters = KMeans.train(train, 50, maxIterations=50)
clusters.save(sc, 's3n://amlyelp/subset/model/kmeans/50_iters_' +
              str(datetime.datetime.now()).replace(' ', '_') + '/')