def ops_main(opt, config): """ ssc main函数 :param opt: :param config: :return: """ # 初始化 sc = SparkContext(appName="bops-ssc") print sc.version # 加载ip库 if config.IP_FILE_PATH: sc.addFile(config.IP_FILE_PATH) else: sc.addFile(geo_db_path) # 创建Spark Streaming Context,每隔3min处理一批数据 step_num = 3 * 60 ssc = StreamingContext(sc, step_num) # monitor kafka消息处理 monitor_main(opt, config, sc, ssc, step_num) opt_main(opt, config, sc, ssc, step_num) error_logs_main(opt, config, sc, ssc, step_num) # 开始streaming 处理 ssc.start() # 等待结束,在执行过程中发生的任何异常将被抛出在这个线程 ssc.awaitTermination()
def main(): ''' Main function ST_AUTH - Object storage auth string where fna containers are found ST_USER - Ojbect storage user token ST_KEY - Ojbect storage secret token MAX_FILE_SIZE - Maximum file set parameter for makeblastdb TASKS - Number of tasks to launch, db partition factor MAKEBLASTDB - Location of makeblastdb executable OBJECT_STORES - list of source containers that built the blast db ''' # Set the context conf = SparkConf() sc = SparkContext(conf=conf) all_config = sc._conf.getAll() fasta_files = all_config['dirs'].split(",") OBJECT_STORES = ['geba'] TASKS = 3 # Quiet the logs sc.setLogLevel("WARN") # Set our spark database creation script and add all the files that are needed to be on the # remote hosts to the shall script ShellScript = "hdfs:///exec/spark_blast/spark_blastdb.bash" sc.addFile(ShellScript) sc.addFile("hdfs:///exec/spark_blast/makeblastdb") # this will be our root name for our DB names db_container = "blastdb_" + "-".join( sorted(OBJECT_STORES)) + "_" + str(TASKS) # Get the list of objects we are going to need, i.e. .fast files # Distribute our data, shuffle it in case there is any size ordering going on shuffle(fasta_files) distData = sc.parallelize(fasta_files, TASKS) # Pass our bash script our parameters, ideally we would like to pass the executor ID/Task ID, but # this doesn't appear to be available in ver 2.1.1 pipeRDD = distData.pipe(ShellScript) # Now let the bash script do its work. This will assemble and store the results of all the list of # fna files collected from each Object Store # # It has done its work--I toss it carelessly to fall where it may # -- Walt Whitman: Leaves of Grass, Book 4 - Children of Adam, Spontaneous Me print("Starting to create %d blast database 'partitions'" % TASKS) for line in pipeRDD.collect(): print(line) print("Complete")
def spark_tob_ats_parse(input_path, output_path1, output_path2): sc = SparkContext(appName="tob_ats_parse") sc.addFile("industries.csv") sc.addFile("function_taxonomy.txt") sc.addFile("dedup_majors_v1.jsonl") rdd = sc.textFile(input_path) \ .mapPartitions(tob_ats_extract_feature_mappattion) rdd.map(lambda line: filter_fea(line)) \ .map(lambda line: json.dumps(line, ensure_ascii=False)) \ .saveAsTextFile(output_path1) rdd.map(lambda line: json.dumps(line, ensure_ascii=False)) \ .saveAsTextFile(output_path2)
class mapreduce: def __init__(self, path): conf = SparkConf() self.sc = SparkContext(conf=conf) filelist = [] for filename in os.listdir(path): if filename != '.DS_Store': #for testing locally on Mac filelist.append(path + filename) self.doc = self.sc.textFile(','.join(filelist)) def wordCount(self): self.counts = self.doc.flatMap(lambda line: line.split()).map( lambda word: (word, 1)).reduceByKey( lambda x, y: x + y).sortByKey() return self.counts def doubleWordCount(self): # Create list of double words def doubleWords(line): line = line.split() doubleWords = () for i in range(len(line) - 1): doubleWords += (line[i] + ' ' + line[i + 1], ) return doubleWords self.double_counts = self.doc.flatMap(doubleWords).map( lambda doubleWord: (doubleWord, 1)).reduceByKey( lambda x, y: x + y).sortByKey() return self.double_counts def findFreq(self, filepath, filename): self.sc.addFile(filepath) # Check if the word is in the target list def isTarget(word): targetList = [] with open(SparkFiles.get(filepath.split('/')[-1])) as publicF: targetList = publicF.read().split() if word[0] in targetList: return True else: return False self.find_freq_counts = self.doc.flatMap( lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey( lambda x, y: x + y).filter(isTarget).sortByKey() return self.find_freq_counts
def save_data_to_db(): from pyspark import SparkContext, SparkConf from pyspark.streaming import StreamingContext conf = SparkConf().setMaster("localhost") sc = SparkContext("local[*]", "tikcket_mechine_gen") sc.setLogLevel("WARN") sc.addFile(lib_dir+'/getDistance.py') data_used_by_ticket_mechine_gen.drop() path = '/3/2014-10-15' for s in stations: full_path = data_dir_path+'v0/'+s+path print full_path data_to_save = getDistance.get_one_day_group_by_time(full_path, sc) for item in data_to_save: data_used_by_ticket_mechine_gen.insert({'station_name':s, 'time':item[0], 'data':item[1]})
def main(): # Configure Spark conf = SparkConf() conf.setAppName("Application name") # Specify the application name conf.set("spark.jars", "file:/shared_data/spark_jars/hadoop-openstack-3.0.0-SNAPSHOT.jar") # Don't modify sc = SparkContext(conf=conf) # Spark Context variable that will be used for all operations running on the cluster parser = argparse.ArgumentParser() parser.add_argument("backend", type=str) parser.add_argument("helperpath", type=str) parser.add_argument("shuffle_partitions", type=str) parser.add_argument("params", type=str) parser.add_argument("inputs", type=str) parser.add_argument("features", type=str, nargs='?') args = parser.parse_args() # Swift Connection if(args.backend == 'swift'): hadoopConf = sc._jsc.hadoopConfiguration() hadoopConf.set("fs.swift.impl", "org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem") hadoopConf.set("fs.swift.service.SparkTest.auth.url", os.environ['OS_AUTH_URL'] + "/tokens") hadoopConf.set("fs.swift.service.SparkTest.http.port", "8443") hadoopConf.set("fs.swift.service.SparkTest.auth.endpoint.prefix", "/") hadoopConf.set("fs.swift.service.SparkTest.region", os.environ['OS_REGION_NAME']) hadoopConf.set("fs.swift.service.SparkTest.public", "false") hadoopConf.set("fs.swift.service.SparkTest.tenant", os.environ['OS_TENANT_ID']) hadoopConf.set("fs.swift.service.SparkTest.username", os.environ['OS_USERNAME']) hadoopConf.set("fs.swift.service.SparkTest.password", os.environ['OS_PASSWORD']) helperpath = str(args.helperpath) # This is passed by default sc.addFile(helperpath + "/utils/helper.py") # To import custom modules shuffle_partitions = args.shuffle_partitions # Create a dict and pass it in your_module_implementation params = json.loads(args.params) inputs = json.loads(args.inputs) features = json.loads(args.features) # Only used when you want to create a feature set sqlContext = SQLContext(sc) # Create SQLContext var from SparkContext, To work with our default format of datasets i.e. Parquet sqlContext.setConf("spark.sql.shuffle.partitions", shuffle_partitions) # Don't change, required for controlling parallelism # Pass the sc (Spark Context) and sqlContext along with the different paramters and inputs. module_implementation(sc, sqlContext, params=params, inputs=inputs, features=features)
def main(): APP_NAME = "CS179G" conf = SparkConf().setAppName(APP_NAME) conf = conf.setMaster("spark://spark53.cs.ucr.edu:7077") sc = SparkContext(conf=conf) sc.addFile("/home/cs179g/artistData.py") #sc.addFile("/home/cs179g/info3.txt") for i in range(1, len(sys.argv)): textRDD = sc.textFile(sys.argv[i]) words = textRDD.map(lambda x: x.split( "--------------------------------------")).map(lambda x: (x, 1)) artistRDD = words.map(lambda x: get_artist_stats(x)).distinct().filter( lambda x: x is not None).filter(lambda x: x != "").filter( lambda row: row.artist_name != "" and row.artist_name != None) temp = artistRDD.map(lambda row: {'name': row.artist_name,\ 'albums': row.artist_album_num,\ 'avg_album': row.artist_avg_album_len,\ 'avg_song': row.artist_avg_song_len,\ 'cont': row.artist_content,\ 'followers': row.artist_followers,\ 'genres': row.artist_genres,\ 'pop': row.artist_pop,\ 'songs': row.artist_song_num,\ 'sum': row.artist_sum,\ 'ref_count': row.artist_ref_count,\ 'duration': row.artist_duration,\ 'sum_word': row.artist_sum_word,\ 'cont_word': row.artist_content_word,\ 'sum_count': row.artist_sum_count,\ 'cont_count': row.artist_content_count,\ 'artist_albums': row.artist_albums,\ 'album_avg_song': row.album_avg_song,\ 'album_duration': row.album_duration,\ 'album_popularity': row.album_popularity,\ 'album_release': row.album_release ,\ 'album_tracks': row.album_tracks,\ 'artist_track': row.artist_track,\ 'track_popularity': row.track_popularity,\ 'track_duration': row.track_duration}) temp.saveToCassandra(keyspace='data', table='all_data')
def main(): APP_NAME = "CS179G" conf = SparkConf().setAppName(APP_NAME) conf = conf.setMaster("spark://spark53.cs.ucr.edu:7077") #conf = conf.setMaster("local[*]") sc = SparkContext(conf=conf) sc.addFile("/home/cs179g/artistData.py") sqlContext = SQLContext(sc) test = sqlContext.read.json("fixed_info.json") test = test.map(lambda x: get_artist_stats(x)) test = test.distinct().filter(lambda x: x is not None).filter( lambda x: x != "").filter( lambda row: row.artist_name != "" and row.artist_name != None) temp = test.map(lambda row: {'name': row.artist_name,\ 'albums': row.artist_album_num,\ 'avg_album': row.artist_avg_album_len,\ 'avg_song': row.artist_avg_song_len,\ 'cont': row.artist_content,\ 'followers': row.artist_followers,\ 'genres': row.artist_genres,\ 'pop': row.artist_pop,\ 'songs': row.artist_song_num,\ 'sum': row.artist_sum,\ 'ref_count': row.artist_ref_count,\ 'duration': row.artist_duration,\ 'sum_word': row.artist_sum_word,\ 'cont_word': row.artist_content_word,\ 'sum_count': row.artist_sum_count,\ 'cont_count': row.artist_content_count,\ 'artist_albums': row.artist_albums,\ 'album_avg_song': row.album_avg_song,\ 'album_duration': row.album_duration,\ 'album_popularity': row.album_popularity,\ 'album_release': row.album_release ,\ 'album_tracks': row.album_tracks,\ 'artist_track': row.artist_track,\ 'track_popularity': row.track_popularity,\ 'track_duration': row.track_duration}) temp.saveToCassandra(keyspace='data', table='all_data')
class SparkDriver: def __init__(self, config): self.config = config log.debug('SPARK_CONFIG: {0}'.format(config)) spark_conf = SparkConf().setMaster(self.config['master']).setAppName( datetime.now().strftime('%Y%m%d%H%M%S')) self.sc = SparkContext(conf=spark_conf) self.sqlContext = SQLContext(self.sc) # 测试代码 if True: import os path = os.path.join('./', "test.txt") with open(path, "w") as testFile: _ = testFile.write("100") self.sc.addFile(path) result = self.sc.parallelize([1, 2, 3, 4]).mapPartitions(func).collect() print(">>>>>>>>>>>>>>>>>", result)
class SimplePySparkSubmit: """ ... """ sc = None def __init__(self, master="local"): ''' ... ''' from pyspark import SparkConf, SparkContext conf = (SparkConf().setMaster(master).setAppName("My app").set( "spark.executor.memory", "1g")) try: self.sc = SparkContext(conf=conf) except Exception as err: print(err) def calculate_iterator(self, iterator): ''' ... ''' from pyspark import SparkFiles path = "tests/data/test.txt" with open(path, "w") as test_file: _ = test_file.write("100") self.sc.addFile(path) with open(SparkFiles.get("test.txt")) as test_file: file_val = int(test_file.readline()) return [x * file_val for x in iterator] def test_map_reduct(self): ''' ... ''' try: stringRDD = self.sc.parallelize( ['Apple', 'Orange', 'Grape', 'Banana', 'Apple']) print( stringRDD.map(lambda f: (f, 1)).reduceByKey( lambda f, n: n + 1).collect()) except: print("Sorry")
def main(): spark_conf = SparkConf().setAppName("Spark Streaming MinHash") global sc sc = SparkContext(conf=spark_conf) sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/min_hash.py") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/locality_sensitive_hash.py") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/util.py") sc.addFile( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/config/config.py") global ssc ssc = StreamingContext(sc, config.SPARK_STREAMING_MINI_BATCH_WINDOW) ssc.checkpoint("_spark_streaming_checkpoint") kafka_stream = KafkaUtils.createDirectStream( ssc, [config.KAFKA_TOPIC], {"metadata.broker.list": config.KAFKA_SERVERS}) # Create and save MinHash and LSH or load them from file if (not os.path.isfile(config.MIN_HASH_PICKLE) or not os.path.isfile(config.LSH_PICKLE)): mh = MinHash(config.MIN_HASH_K_VALUE) lsh = LSH(config.LSH_NUM_BANDS, config.LSH_BAND_WIDTH, config.LSH_NUM_BUCKETS) util.save_pickle_file(mh, config.MIN_HASH_PICKLE) util.save_pickle_file(lsh, config.LSH_PICKLE) else: mh = util.load_pickle_file(config.MIN_HASH_PICKLE) lsh = util.load_pickle_file(config.LSH_PICKLE) # Process stream kafka_stream.map(lambda kafka_response: json.loads(kafka_response[1]))\ .map(lambda json_body: extract_data(json_body))\ .foreachRDD(lambda rdd: rdd.foreachPartition(lambda question: process_mini_batch(question, mh, lsh))) ssc.start() ssc.awaitTermination()
def main(): ### Initialize the SparkConf and SparkContext ### Locations of Python files. sheets_loc = "/root/IdeaNets/Synapsify/Synapsify/loadCleanly/sheets.py" lstm_class_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/lstm_class.py" load_params_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/load_params.py" preprocess_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/synapsify_preprocess.py" ### Pass Python files to Spark. pyFiles = [] pyFiles.append(sheets_loc) pyFiles.append(lstm_class_loc) pyFiles.append(load_params_loc) pyFiles.append(preprocess_loc) ### Automatically get the master node url from AWS, normally it is fixed. cmd = ["./../../spark/ec2/spark-ec2", "-r", "us-east-1", "get-master", "ruofan-cluster"] hostname = ( subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0].split("\n")[2] ) ### host name of the master node. master_url = "" master_url += "spark://" master_url += hostname master_url += ":7077" # print master_url ### Initialize the spark configuration. conf = SparkConf().setAppName("ruofan").setMaster(master_url) sc = SparkContext(conf=conf, pyFiles=pyFiles) ### Add non-python files passing to Spark. sc.addFile("/root/spark/bin/nonbreaking_prefix.en") sc.addFile("/root/IdeaNets/IdeaNets/models/lstm/scode/tokenizer.perl") sc.addFile("/root/IdeaNets/Synapsify/Synapsify/loadCleanly/stopwords.txt") sc.addFile("/root/IdeaNets/Synapsify/Synapsify/loadCleanly/prepositions.txt") datafile = sc.wholeTextFiles( "s3n://synapsify-lstm/Synapsify_data1", use_unicode=False ) ### Read data directory from S3 storage. ### Sent the application in each of the slave node datafile.foreach(lambda (path, content): lstm_test(path, content))
def init_spark_context(): global predictionModel # load spark context conf = SparkConf().setAppName("movie_recommendation-server") # IMPORTANT: pass aditional Python modules to each worker sc = SparkContext(conf=conf, pyFiles=['webapp.py', 'service_func.py']) # absolute path in hdfs # to run locally, remove first slash '/' i.e my_model1, not /my_model1 predictionModel = DecisionTreeModel.load(sc, '/my_model1') sc.addFile( 'conv/6.p') sc.addFile( 'conv/7.p') sc.addFile( 'conv/8.p') sc.addFile('conv/10.p') sc.addFile('conv/12.p') sc.addFile( 'conv/36.p') return sc
def sc(self): # noqa if not self._spark_context: spark_context = SparkContext(conf=self.spark_config) assert self.spex_conf.spex_file is not None, "The spex builder must be broken I do not know my spex conf!" spark_context.addFile(self.spex_conf.spex_file) for py_file in self.spex_conf.spark_config.py_files: spark_context.addPyFile(py_file) for file in self.spex_conf.spark_config.files: # noqa spark_context.addFile(file) for jar in self.spex_conf.spark_config.jars: # noqa spark_context.addFile(jar) self._spark_context = spark_context print_banner(self) return self._spark_context
def trial_case(results, seed=180555, context='wstack', nworkers=8, threads_per_worker=1, processes=True, order='frequency', nfreqwin=7, ntimes=3, rmax=750.0, facets=1, wprojection_planes=1, parallelism=16): npol = 1 if parallelism == -1: parallelism = None np.random.seed(seed) results['seed'] = seed start_all = time.time() results['context'] = context results['hostname'] = socket.gethostname() results['git_hash'] = git_hash() results['epoch'] = time.strftime("%Y-%m-%d %H:%M:%S") zerow = False print("Context is %s" % context) results['nworkers'] = nworkers results['threads_per_worker'] = threads_per_worker results['processes'] = processes results['order'] = order results['nfreqwin'] = nfreqwin results['ntimes'] = ntimes results['rmax'] = rmax results['facets'] = facets results['wprojection_planes'] = wprojection_planes print("At start, configuration is {0!r}".format(results)) conf = SparkConf().setMaster("local[4]") sc = SparkContext(conf=conf) sc.addFile("./LOWBD2.csv") sc.addFile("./sc256") sc.addFile("./SKA1_LOW_beam.fits") # sc.addFile("./GLEAM_EGC.fits") frequency = np.linspace(0.8e8, 1.2e8, nfreqwin) if nfreqwin > 1: channel_bandwidth = np.array(nfreqwin * [frequency[1] - frequency[0]]) else: channel_bandwidth = np.array([1e6]) times = np.linspace(-np.pi / 3.0, np.pi / 3.0, ntimes) phasecentre = SkyCoord(ra=+30.0 * u.deg, dec=-60.0 * u.deg, frame='icrs', equinox='J2000') config = 'LOWBD2' polarisation_frame = PolarisationFrame("stokesI") #add broadcast value for telescope_management_data telescope_management = telescope_management_handle_locality( sc, config, rmax) telescope_management_data = telescope_data_generate_locality( telescope_management, times=times, frequencys=frequency, channel_bandwidth=channel_bandwidth, weight=1.0, phasecentre=phasecentre, polarisation_frame=polarisation_frame, order=order) key, meta = next(telescope_management_data) print(key) print(meta["frequencys"]) broadcast_tele = sc.broadcast(telescope_management_data) vis_graph_list = create_simulate_vis_graph( sc, 'LOWBD2', frequency=frequency, channel_bandwidth=channel_bandwidth, times=times, phasecentre=phasecentre, order=order, format='blockvis', rmax=rmax) print("****** Visibility creation ******") wprojection_planes = 1 vis = None for v in vis_graph_list.collect(): if v[0][2] == 0: vis = v[1] break advice = advise_wide_field(convert_blockvisibility_to_visibility(vis), guard_band_image=6.0, delA=0.02, facets=facets, wprojection_planes=wprojection_planes, oversampling_synthesised_beam=4.0) kernel = advice['kernel'] npixel = advice['npixels2'] cellsize = advice['cellsize'] print(cellsize) print(npixel) if context == 'timeslice' or context == 'facets_timeslice': vis_slices = ntimes elif context == '2d' or context == 'facets': vis_slices = 1 kernel = '2d' else: vis_slices = advice['vis_slices'] # vis_slices = 4 results['vis_slices'] = vis_slices results['cellsize'] = cellsize results['npixel'] = npixel print(vis_slices) gleam_model_graph = create_low_test_image_from_gleam_spark( sc=sc, npixel=npixel, frequency=frequency, channel_bandwidth=channel_bandwidth, cellsize=cellsize, phasecentre=phasecentre, polarisation_frame=PolarisationFrame("stokesI"), flux_limit=0.1, applybeam=False) start = time.time() print("****** Starting GLEAM model creation ******") # gleam_model_graph.cache() # gleam_model_graph.collect() print("****** Finishing GLEAM model creation *****") end = time.time() results['time create gleam'] = end - start print("Creating GLEAM model took %.2f seconds" % (end - start)) vis_graph_list = 
create_predict_graph_first(gleam_model_graph, broadcast_tele, vis_slices=vis_slices, facets=facets, context=context, kernel=kernel, nfrequency=nfreqwin) start = time.time() print("****** Starting GLEAM model visibility prediction ******") # vis_graph_list.cache() # vis_graph_list.collect() end = time.time() results['time predict'] = end - start print("GLEAM model Visibility prediction took %.2f seconds" % (end - start)) # Correct the visibility for the GLEAM model print("****** Visibility corruption ******") vis_graph_list = create_corrupt_vis_graph(vis_graph_list, phase_error=1.0) start = time.time() vis_graph_list.cache() vis_graph_list.collect() end = time.time() results['time corrupt'] = end - start print("Visibility corruption took %.2f seconds" % (end - start)) # Create an empty model image model_graph = create_empty_image( vis_graph_list, npixel=npixel, cellsize=cellsize, frequency=frequency, channel_bandwidth=channel_bandwidth, polarisation_frame=PolarisationFrame("stokesI")) model_graph.cache() model_graph.collect() # psf_graph = create_invert_graph(vis_graph_list, model_graph, vis_slices=vis_slices, context=context, facets=facets, # dopsf=True, kernel=kernel) # # start = time.time() # print("****** Starting PSF calculation ******") # psfs = psf_graph.collect() # psf = None # for i in psfs: # if i[0][2] == 0: # psf = i[1][0] # end = time.time() # results['time psf invert'] = end - start # print("PSF invert took %.2f seconds" % (end - start)) # # results['psf_max'] = qa_image(psf).data['max'] # results['psf_min'] = qa_image(psf).data['min'] # # print(results['psf_max']) # print(results['psf_min']) # # # dirty_graph = create_invert_graph(vis_graph_list, model_graph, vis_slices=vis_slices, context=context, facets=facets, # kernel=kernel) # # start = time.time() # print("****** Starting dirty image calculation ******") # dirtys = dirty_graph.collect() # dirty, sumwt = (None, None) # for i in dirtys: # if i[0][2] == 0: # dirty, sumwt = i[1] # # print(psf.shape) # print(dirty.shape) # end = time.time() # results['time invert'] = end - start # print("Dirty image invert took %.2f seconds" % (end - start)) # print("Maximum in dirty image is ", numpy.max(numpy.abs(dirty.data)), ", sumwt is ", sumwt) # qa = qa_image(dirty) # results['dirty_max'] = qa.data['max'] # results['dirty_min'] = qa.data['min'] # # start = time.time() # print("***** write data to file *****") # export_images_to_fits(psfs, nfreqwin, "psf.fits") # export_images_to_fits(dirtys, nfreqwin, "dirty.fits") # end = time.time() # results['time write'] = end - start print("****** Starting ICAL ******" + " parallelism = " + str(parallelism)) start = time.time() residual_graph, deconvolve_graph, restore_graph = create_ical_graph_locality( sc, vis_graph_list, model_graph, nchan=nfreqwin, context=context, vis_slices=vis_slices, facets=facets, first_selfcal=1, algorithm='msclean', nmoments=3, niter=1000, fractional_threshold=0.1, scales=[0, 3, 10], threshold=0.1, nmajor=5, gain=0.7, timeslice='auto', global_solution=True, window_shape='quarter', parallelism=parallelism) deconvolveds = deconvolve_graph.collect() residuals = residual_graph.collect() restores = restore_graph.collect() end = time.time() results['time ICAL'] = end - start print("ICAL graph execution took %.2f seconds" % (end - start)) residual = None for i in residuals: if i[0][2] == 0: residual = i[1][0] print(residual) qa = qa_image(residual) results['residual_max'] = qa.data['max'] results['residual_min'] = qa.data['min'] export_images_to_fits(residuals, nfreqwin, 
"pipelines-timings-delayed-ical_residual.fits") deconvolve = None for i in deconvolveds: if i[0][2] == 0: deconvolve = i[1] print(deconvolve) qa = qa_image(deconvolve) results['deconvolved_max'] = qa.data['max'] results['deconvolved_min'] = qa.data['min'] export_images_to_fits(deconvolveds, nfreqwin, "pipelines-timings-delayed-deconvolved.fits", has_sumwt=False) restore = None for i in restores: if i[0][2] == 0: restore = i[1] print(restore) qa = qa_image(restore) results['restored_max'] = qa.data['max'] results['restored_min'] = qa.data['min'] export_images_to_fits(restores, nfreqwin, "pipelines-timings-delayed-restored.fits", has_sumwt=False) end_all = time.time() results['time overall'] = end_all - start_all print("At end, results are {0!r}".format(results)) sc.stop() return results
# Dummy Spark App demo from pyspark import SparkContext, SparkConf from pyspark import SparkFiles import numpy as np from barista.customer import Customer conf = SparkConf().setAppName("Dummy Demo") sc = SparkContext(conf=conf) # Add prototxt files to Spark Context sc.addFile("models/solver.prototxt") sc.addFile("models/train_val.prototxt") # Add barista module sc.addPyFile("barista.zip") sc.addPyFile("barista/start.py") # Subclass generic barista Customer class MyCustomer(Customer): def __init__(self, filename): compute_semaphore, model_semaphore, handles = \ Customer.parse_ipc_interface_file(filename) Customer.__init__(self, compute_semaphore, model_semaphore, handles) def update_data(self): self.arrays['data'][:] = np.random.randn(*self.arrays['data'].shape) self.arrays['label'][:] = np.random.choice( xrange(10), size=self.arrays['label'].shape)
import os from pyspark import SparkFiles, SparkConf, SparkContext # sparkConf = SparkConf().setAppName("cz").setMaster("local[2]") # sc = SparkContext(sparkConf) sc = SparkContext('local[1]', 'pyspark') tempdir = "D:\panrui\我的桌面\learning file\data\\" path = os.path.join(tempdir, "test.txt") with open(path, "w") as TextFile: _ = TextFile.write("100") sc.addFile(path) def func(iterator): with open(SparkFiles.get("test.txt")) as textFile: fileVal = int(textFile.readline()) return [x * fileVal for x in iterator] if __name__ == '__main__': sc.parallelize([1, 2, 3, 4]).mapPartitions(func).collect()
from pyspark.sql.types import * from pyspark.ml.clustering import * from pyspark.ml.feature import * from pyspark.ml.linalg import * from pyspark.ml.linalg import SparseVector, DenseVector, VectorUDT from pyspark.mllib.linalg.distributed import IndexedRowMatrix from operator import add from distribute_riak import * from sklearn.neighbors import LSHForest import numpy as np #create spark context and SQL context sc = SparkContext(appName="Recommend") sqlContext = SQLContext(sc) sc.addFile("settings.yaml") sc.addPyFile("distribute_riak.py") #load settings.yaml with open("settings.yaml", 'r') as stream: try: settings = yaml.load(stream) except yaml.YAMLError as exc: print(exc) #read in vector data from S3 subreddit_vectors_df = sqlContext.read.parquet(settings['subreddit-vectors']) author_vectors_df = sqlContext.read.parquet(settings['author-vectors']) #filter out inactive subs inactive_subs = sqlContext.read.parquet(
def runFPGrowth(data, minSupport): freqItems = getFrequentItems(data, minSupport) freqItemsets = getFrequentItemsets(data, minSupport, freqItems) return freqItemsets if __name__ == "__main__": APP_NAME = "FPGrowth" conf = SparkConf().setAppName(APP_NAME) conf = conf.setMaster("local[*]") ##comment this if working on server sc = SparkContext(conf=conf) sc.addFile("fpTree.py") # sc.setLogLevel("ERROR") finput = sys.argv[1] foutput = sys.argv[2] numPartitions = int(sys.argv[4]) # file = open("output.txt",'w+') threshold = float(sys.argv[3]) data = sc.textFile( finput, numPartitions).map(lambda x: [int(y) for y in x.strip().split(' ')]) minSupport = data.count() * threshold / 100 freqItems = getFrequentItems(data, minSupport) rank = dict([(index, item) for (item, index) in enumerate(freqItems)])
def main(): conf = (SparkConf() .setMaster("local[*]") .setAppName("compare_engine")) sc = SparkContext(conf = conf) sc.setLogLevel('INFO') sc.addFile(primary) # rdd_primary = sc.textFile(primary, minPartitions=4, use_unicode=True).distinct() rdd_primary = sc.textFile(SparkFiles.get(primary), minPartitions=4, use_unicode=True).distinct() rdd_primary.partitionBy(10).cache() os.system('rm -Rf collects_*') os.system('rm -Rf holder.txt') rdd_secondary = sc.textFile(secondary, minPartitions=4, use_unicode=True).distinct() rdd_secondary.partitionBy(10).cache() primary_count = rdd_primary.count() primary_report['count'] = primary_count print(primary_report) secondary_count = rdd_secondary.count() secondary_report['count'] = secondary_count print(secondary_report) # Return each Primary file line/record not contained in Secondary not_in_primary = rdd_primary.subtract(rdd_secondary) primary_diff = not_in_primary.count() primary_report['diff'] = primary_diff os.system('rm -Rf collects_*.csv') primary_dir = 'collects_{}_primary'.format(run_date) primary_report_name = 'collects_{}_primary_report.csv'.format(run_date) not_in_primary.coalesce(1, True).saveAsTextFile(primary_dir) # os.system('cat collects_{}_primary/part-0000* >> collects_{}_primary_report.csv'.format(run_date, run_date)) os.system('cat {}/part-0000* >> {}'.format(primary_dir, primary_report_name)) os.system('wc -l collects_{}_primary_report.csv'.format(run_date)) # Flip Primary Vs Secondary # Return each Secondary file line/record not contained in Primary not_in_secondary = rdd_secondary.subtract(rdd_primary) secondary_diff = not_in_secondary.count() secondary_report['diff'] = secondary_diff not_in_secondary.coalesce(1,True).saveAsTextFile('collects_{}_secondary'.format(run_date)) os.system('cat collects_{}_secondary/part-0000* >> collects_{}_secondary_report.csv'.format(run_date, run_date)) os.system('wc -l collects_{}_secondary_report.csv'.format(run_date)) process_report['primary'] = primary_report process_report['secondary'] = secondary_report print("=" * 100) print('\n') print(process_report) print('\n') print("=" * 100) spark_details(sc) sc.stop()
# fetch the results result = map(lambda x: (x[0], json.loads(x[1].data)), requests) # remove any empty results and return return filter(lambda x: x[1] is not None, result) def fetchCallSigns(input): """Fetch call signs""" return input.mapPartitions(lambda callSigns: processCallSigns(callSigns)) contactsContactList = fetchCallSigns(validSigns) # Compute the distance of each call using an external R program distScript = os.getcwd()+"/src/R/finddistance.R" distScriptName = "finddistance.R" sc.addFile(distScript) def hasDistInfo(call): """Verify that a call has the fields required to compute the distance""" requiredFields = ["mylat", "mylong", "contactlat", "contactlong"] return all(map(lambda f: call[f], requiredFields)) def formatCall(call): """Format a call so that it can be parsed by our R program""" return "{0},{1},{2},{3}".format( call["mylat"], call["mylong"], call["contactlat"], call["contactlong"]) pipeInputs = contactsContactList.values().flatMap(
import itertools import sys def _getCountryByIP(ip): citydb = geoIP.Reader(SparkFiles.get('GeoLite2-City.mmdb')) return (citydb.city(ip).country.name or u'Unknown').encode() if __name__ == '__main__': if len(sys.argv) != 3: print >> sys.stderr, "Usage: forgeInternationalAccess <date> <hour>" exit(-1) spark = SparkContext(appName='ForgeGeoAccess') spark.addPyFile('hdfs://digiledap/user/spark/share/lib/accessLogParser.py') spark.addFile('hdfs://digiledap/user/spark/share/lib/GeoLite2-City.mmdb') from accessLogParser import * from snakebite.client import Client hdfsHandle = Client('hmaster01') hosts = spark.parallelize(hdfsHandle.ls(['/flume/events/apache_access_combined/']))\ .filter(lambda dirs: dirs['file_type'] == 'd')\ .map(lambda directory: 'hdfs://digiledap%s' % directory['path'])\ .collect() rdds = { item.split('/')[-1]: spark.textFile('%s/%s/%s' % (item, sys.argv[1], sys.argv[2])) for item in hosts } results = {
from sklearn.feature_extraction.text import TfidfVectorizer import numpy as np from pyspark.mllib.regression import LabeledPoint from random import randint from pyspark.mllib.classification import NaiveBayes from pyspark.mllib.linalg import SparseVector from pyspark import SparkContext from pyspark import SparkFiles from functools import partial trainF="./data/train" #the path to where the train data is sc = SparkContext(appName="Simple App") #initialize the spark context #since we are not in the command line interface we need to add to the spark context #some of our classes so that they are available to the workers sc.addFile("/home/christos.giatsidis/data_camp_2015_dec/helpers.py") sc.addFile("/home/christos.giatsidis/data_camp_2015_dec/exctract_terms.py") #now if we import these files they will also be available to the workers from helpers import * import exctract_terms as et # load data : data is a list with the text per doc in each cell. #Y is the respective class value #1 :positive , 0 negative print "loading local data" data,Y=lf.loadLabeled(trainF) print "preprocessing" pp.proc(data) #clean up the data from number, html tags, punctuations (except for "?!." ...."?!" are replaced by "."
import re import sys from pyspark import SparkContext #Create Spark Context with the master details and the application name sc = SparkContext("spark://localhost:7077", "max_temperature") #Add a file to be downloaded with this Spark job on every node. sc.addFile("/home/bigdatavm/Code/Spark/filter_weather_records.rb") #Create an RDD from the input data in HDFS weatherData = sc.textFile("hdfs://localhost:9000/user/bigdatavm/input") #Transform the data to extract/filter and then find the max temperature max_temperature_per_year = weatherData.pipe("../ruby/filter_weather_records.rb").map(lambda x: (x.split("\t")[0], x.split("\t")[1])).reduceByKey(lambda a,b : a if int(a) > int(b) else b).coalesce(1) #Save the RDD back into HDFS max_temperature_per_year.saveAsTextFile("hdfs://localhost:9000/user/bigdatavm/output")
def main(): config = configparser.ConfigParser() config.read('config.ini') #Path for Gromacs project gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path')) #Path where PDB ligand are - They are NOT participated in docking pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path') #Path that contains all files for analysis path_analysis = config.get('DEFAULT', 'path_analysis') #Ligand Database file ligand_database = config.get('DEFAULT', 'ligand_database_path_file') #Path where all pdb receptor are path_receptor_pdb = config.get('DEFAULT', 'pdb_path') #Path for saving pdb files of models generated by VS path_analysis_pdb = get_directory_pdb_analysis(path_analysis) # Create SPARK config maxResultSize = str(config.get('SPARK', 'maxResultSize')) conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize)) # Create context sc = SparkContext(conf=conf) sqlCtx = SQLContext(sc) #Adding Python Source file #Path for drugdesign project path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign') sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"os_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"gromacs_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"pdb_io.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"database_io.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"json_utils.py")) #Adding bash scripts sc.addFile(os.path.join(path_spark_drugdesign,"make_ndx_buried_area_total.sh")) sc.addFile(os.path.join(path_spark_drugdesign,"make_sasa_rec_buried_area_total.sh")) #Parameters form command line #Indicates probe. Example: 0.14 probe = float(sys.argv[1]) #Indicates ndots. Example: 24 ndots = int(sys.argv[2]) #Broadcast path_analysis_pdb_complex_b = sc.broadcast(path_analysis_pdb) gromacs_path = sc.broadcast(gromacs_path) pdb_ligand_path = sc.broadcast(pdb_ligand_path) probe = sc.broadcast(probe) ndots = sc.broadcast(ndots) start_time = datetime.now() os.environ["GMX_MAXBACKUP"]="-1" #Loading all PDB receptor files into memory list_all_pdb_receptor_files_path = [] all_receptor_for_complex = get_files_pdb(path_receptor_pdb) for receptor in all_receptor_for_complex: list_all_pdb_receptor_files_path.append(loading_pdb_2_list(receptor)) #Computing Buried areas for pdb_receptor_files in list_all_pdb_receptor_files_path: #Getting receptor name by fully path base_file_name_receptor = get_name_receptor_pdb(str(pdb_receptor_files[0])) #PDB file loaded into memory is sent by broadcast pdb_file_receptor = pdb_receptor_files[1] pdb_file_receptor = sc.broadcast(pdb_file_receptor) #Loading PDB model files based on receptor into memory base_file_name_receptor_for_filter = base_file_name_receptor+"_-_" all_model_for_complex = get_files_pdb_filter(path_analysis_pdb,base_file_name_receptor_for_filter) all_model_for_complexRDD = sc.parallelize(all_model_for_complex) all_model_filesRDD = all_model_for_complexRDD.map(loading_pdb_2_list).collect() # ********** Starting function ********************************************************** def compute_buried_area(pdb_complex): chZ = "chZ" sasa_complex = -1.0 sasa_rec = -1.0 sasa_lig = -1.0 buried_total = -1.0 returned_list = [] try: base_name = get_name_model_pdb(pdb_complex) ligand_name = get_ligand_from_receptor_ligand_model(base_name) f_pdb_ligand_no_docking = os.path.join(pdb_ligand_path.value,ligand_name+".pdb") f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx") f_temp_sasa_complex = 
os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_complex.xvg") f_temp_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_rec.xvg") f_temp_sasa_lig = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig.xvg") # Makes the index file with the ligand (chain z) and the rest (non chain z) script_make_ndx = SparkFiles.get("make_ndx_buried_area_total.sh") #Getting bash script that was copied by addFile command command = script_make_ndx + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() command = gromacs_path.value +"gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface System " + " -output System "+ " -xvg none " + " -o " + f_temp_sasa_complex process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() # Makes f_temp_sasa_rec file script_make_sasa_rec = SparkFiles.get("make_sasa_rec_buried_area_total.sh") #Getting bash script that was copied by addFile command command = script_make_sasa_rec + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " " + f_temp_sasa_rec process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() command = gromacs_path.value +"gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface chZ " + " -output chZ "+ " -xvg none " + " -o " + f_temp_sasa_lig process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() sasa_complex = get_value_from_xvg_sasa(f_temp_sasa_complex) sasa_rec = get_value_from_xvg_sasa(f_temp_sasa_rec) sasa_lig = get_value_from_xvg_sasa(f_temp_sasa_lig) buried_total = sasa_rec + sasa_lig - sasa_complex #Generating result - See column sorting because resultaed file will be created based on this sorting returned_list = (base_name, buried_total) except: returned_list = (base_name, float(0)) #Deleting files if os.path.exists(f_ndx): os.remove(f_ndx) if os.path.exists(f_temp_sasa_complex): os.remove(f_temp_sasa_complex) if os.path.exists(f_temp_sasa_rec): os.remove(f_temp_sasa_rec) if os.path.exists(f_temp_sasa_lig): os.remove(f_temp_sasa_lig) return returned_list # ********** Finish function ********************************************************** # ********** Starting function ********************************************************** def save_model_receptor(list_receptor_model_file): receptor_file = pdb_file_receptor.value #Obtained from broadcast model_file = list_receptor_model_file[0] full_path_for_save_complex = list_receptor_model_file[1] #Open file for writting the complex f_compl = open(full_path_for_save_complex, "w") #Insert lines of receptor for item in receptor_file: f_compl.write(item) #Insert lines of model and insert Z chain for item in model_file: item = replace_chain_atom_line(item,"d","z") f_compl.write(item) f_compl.close() # ********** Finish function ********************************************************** # ********** Starting function ********************************************************** def build_list_model_for_complex(model): full_path_model = model[0] model_file = model[1] path_pdb_complex = path_analysis_pdb_complex_b.value #Obtained from broadcast #Building complex file based on model file name base_name_model = get_name_model_pdb(full_path_model) complex_name = "compl_"+base_name_model+".pdb" full_path_for_save_complex = 
os.path.join(path_pdb_complex,complex_name) list_receptor_model_file = (model_file, full_path_for_save_complex) save_model_receptor(list_receptor_model_file) list_ret = compute_buried_area(full_path_for_save_complex) os.remove(full_path_for_save_complex) return list_ret # ********** Finish function ********************************************************** all_model_filesRDD = sc.parallelize(all_model_filesRDD) all_model_filesRDD = all_model_filesRDD.map(build_list_model_for_complex).collect() #Saving buried area of receptor full_area_file = os.path.join(path_analysis,base_file_name_receptor+".area") save_receptor_buried_area(full_area_file, all_model_filesRDD) #Loading all area file all_area_file = os.path.join(path_analysis,"*.area") buried_areaRDD = sc.textFile(all_area_file).map(loading_lines_from_area_files).collect() #Sorting by buried_total column buried_area_sorted_by_buried_total = sorting_buried_area(sc, buried_areaRDD) buried_area_sorted_by_buried_total.cache() buried_area_sorted_by_buried_total_LIST = buried_area_sorted_by_buried_total.map(lambda p: (p.pose, p.buried_total) ).collect() #Saving buried area file path_file_buried_area = os.path.join(path_analysis, "summary_buried_areas_total.dat") save_buried_area(path_file_buried_area, buried_area_sorted_by_buried_total_LIST) #Calculating normalized buried area #Loading database rdd_database = load_database(sc, ligand_database) #Creating Dataframe database_table = sqlCtx.createDataFrame(rdd_database) database_table.registerTempTable("database") number_pose_ligandRDD = buried_area_sorted_by_buried_total.map(lambda p: Row(buried_total=int(p.buried_total), ligand=get_ligand_from_receptor_ligand_model(p.pose), pose=str(p.pose) ) ).collect() number_pose_ligand_table = sqlCtx.createDataFrame(number_pose_ligandRDD) number_pose_ligand_table.registerTempTable("buried_area_total_sort") sql = """ SELECT pose, (b.buried_total / a.heavyAtom) as normalized_buried_area FROM database a JOIN buried_area_total_sort b ON b.ligand = a.ligand ORDER BY normalized_buried_area DESC """ #Getting all data full_dataRDD = sqlCtx.sql(sql) #Saving normalized buried area file path_file_buried_area = os.path.join(path_analysis, "summary_normalized_buried_areas.dat") save_normalized_buried_area(path_file_buried_area, full_dataRDD) #Removing all area files all_area_files = get_files_area(path_analysis) for area_file in all_area_files: os.remove(area_file) finish_time = datetime.now() save_log(finish_time, start_time)
def main(): config = configparser.ConfigParser() config.read('config.ini') #Path for Gromacs project gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path')) #Path where PDB ligand are - They are NOT participated in docking pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path') #Path that contains all files for analysis path_analysis = config.get('DEFAULT', 'path_analysis') #Path where all pdb receptor are path_receptor_pdb = config.get('DEFAULT', 'pdb_path') #Path for saving pdb files of models generated by VS path_analysis_pdb = get_directory_pdb_analysis(path_analysis) # Create SPARK config maxResultSize = str(config.get('SPARK', 'maxResultSize')) conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize)) # Create context sc = SparkContext(conf=conf) #Adding Python Source file #Path for drugdesign project path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign') sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"os_util.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"gromacs_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"pdb_io.py")) #Adding bash scripts sc.addFile(os.path.join(path_spark_drugdesign,"make_ndx_buried_area_ligand.sh")) #Parameters form command line #Indicates probe. Example: 0.14 probe = float(sys.argv[1]) #Indicates ndots. Example: 24 ndots = int(sys.argv[2]) #Broadcast path_analysis_pdb_complex_b = sc.broadcast(path_analysis_pdb) gromacs_path = sc.broadcast(gromacs_path) pdb_ligand_path = sc.broadcast(pdb_ligand_path) probe = sc.broadcast(probe) ndots = sc.broadcast(ndots) start_time = datetime.now() os.environ["GMX_MAXBACKUP"]="-1" #Loading all PDB receptor files into memory list_all_pdb_receptor_files_path = [] all_receptor_for_complex = get_files_pdb(path_receptor_pdb) for receptor in all_receptor_for_complex: list_all_pdb_receptor_files_path.append(loading_pdb_2_list(receptor)) for pdb_receptor_files in list_all_pdb_receptor_files_path: #Getting receptor name by fully path base_file_name_receptor = get_name_receptor_pdb(str(pdb_receptor_files[0])) #PDB file loaded into memory is sent by broadcast pdb_file_receptor = pdb_receptor_files[1] pdb_file_receptor = sc.broadcast(pdb_file_receptor) #Loading PDB model files based on receptor into memory base_file_name_receptor_for_filter = base_file_name_receptor+"_-_" all_model_for_complex = get_files_pdb_filter(path_analysis_pdb,base_file_name_receptor_for_filter) all_model_for_complexRDD = sc.parallelize(all_model_for_complex) all_model_filesRDD = all_model_for_complexRDD.map(loading_pdb_2_list).collect() # ********** Starting function ********************************************************** def save_model_receptor(list_receptor_model_file): receptor_file = pdb_file_receptor.value #Obtained from broadcast model_file = list_receptor_model_file[0] full_path_for_save_complex = list_receptor_model_file[1] #Open file for writting the complex f_compl = open(full_path_for_save_complex, "w") #Insert lines of receptor for item in receptor_file: f_compl.write(item) #Insert lines of model and insert Z chain for item in model_file: item = replace_chain_atom_line(item,"d","z") f_compl.write(item) f_compl.close() # ********** Finish function ********************************************************** # ********** Starting function ********************************************************** def compute_buried_area_ligand(pdb_complex): chZ = "chZ" buried_lig_rec_perc = -1.0 buried_lig_rec = -1.0 buried_lig_lig 
= -1.0 buried_lig_lig_perc = -1.0 base_name = get_name_model_pdb(pdb_complex) ligand_name = get_ligand_from_receptor_ligand_model(base_name) receptor_name = get_receptor_from_receptor_ligand_model(base_name) pose = get_model_from_receptor_ligand_model(base_name) pdb_before_vs = os.path.join(pdb_ligand_path.value,ligand_name+".pdb") #ndx files f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx") #xvg files xvg_temp_sasa_lig_pose = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_pose"+".xvg") xvg_temp_sasa_lig_complex = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_complex"+".xvg") xvg_temp_sasa_lig_min = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_min"+".xvg") # Creates a selection with the residues that are closer than 6A to the ligand script_make_ndx_buried_area_ligand = SparkFiles.get("make_ndx_buried_area_ligand.sh") #Getting bash script that was copied by addFile command command = script_make_ndx_buried_area_ligand + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " "+ xvg_temp_sasa_lig_pose + " "+ str(probe.value) + " "+ str(ndots.value) + " "+ xvg_temp_sasa_lig_complex + " "+ pdb_before_vs + " "+ xvg_temp_sasa_lig_min process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() # SASA of the isolated ligand in the pose conformation sasa_lig_pose = get_value_from_xvg_sasa(xvg_temp_sasa_lig_pose) # SASA of the complexed ligand in the pose conformation sasa_lig_complex = get_value_from_xvg_sasa(xvg_temp_sasa_lig_complex) # SASA of the isolated ligand in its energy-minimized conformation. Only for carbohydrates! sasa_lig_min = get_value_from_xvg_sasa(xvg_temp_sasa_lig_min) # Area of the ligand which is buried in the receptor buried_lig_rec = sasa_lig_pose - sasa_lig_complex buried_lig_rec_perc = buried_lig_rec / sasa_lig_pose # Area of the ligand in the pose conformation which is buried in itself when compared to the energy-minimized conformation buried_lig_lig = sasa_lig_min - sasa_lig_pose buried_lig_lig_perc = buried_lig_lig / sasa_lig_min returned_list = (base_name, buried_lig_rec, buried_lig_rec_perc, buried_lig_lig, buried_lig_lig_perc) #Deleting files os.remove(f_ndx) os.remove(xvg_temp_sasa_lig_pose) os.remove(xvg_temp_sasa_lig_complex) os.remove(xvg_temp_sasa_lig_min) return returned_list # ********** Finish function ********************************************************** # ********** Starting function ********************************************************** def build_list_model_for_complex(model): full_path_model = model[0] model_file = model[1] path_pdb_complex = path_analysis_pdb_complex_b.value #Obtained from broadcast #Building complex file based on model file name base_name_model = get_name_model_pdb(full_path_model) complex_name = "compl_"+base_name_model+".pdb" full_path_for_save_complex = os.path.join(path_pdb_complex,complex_name) list_receptor_model_file = (model_file, full_path_for_save_complex) save_model_receptor(list_receptor_model_file) list_ret = compute_buried_area_ligand(full_path_for_save_complex) os.remove(full_path_for_save_complex) return list_ret # ********** Finish function ********************************************************** all_model_filesRDD = sc.parallelize(all_model_filesRDD) all_model_filesRDD = all_model_filesRDD.map(build_list_model_for_complex).collect() #Saving buried area of residue receptor full_area_file = os.path.join(path_analysis,base_file_name_receptor+".ligandArea") 
save_buried_area_ligand(full_area_file, all_model_filesRDD) #Loading all area file all_area_file = os.path.join(path_analysis,"*.ligandArea") buried_areaRDD = sc.textFile(all_area_file).map(loading_lines_from_ligandArea_files).collect() #Sorting by buried_lig_lig column buried_area_sorted_by_buried_lig_rec = sorting_buried_area_ligand(sc, buried_areaRDD) buried_area_sorted_by_buried_lig_rec = buried_area_sorted_by_buried_lig_rec.map(lambda p: (p.pose, p.buried_lig_rec, p.buried_lig_rec_perc, p.buried_lig_lig, p.buried_lig_lig_perc) ).collect() #p.receptor, p.ligand, p.model #Saving buried area ligand file path_file_buried_area = os.path.join(path_analysis, "summary_buried_area_ligand.dat") save_buried_area_ligand_sort(path_file_buried_area, buried_area_sorted_by_buried_lig_rec) #Removing all area files all_area_files = get_files_ligandArea(path_analysis) for area_file in all_area_files: os.remove(area_file) finish_time = datetime.now() save_log(finish_time, start_time)
def _locate(example_name): return "../examples/smalldata/" + example_name conf = SparkConf().setAppName("ChicagoCrimeTest").setIfMissing("spark.master", os.getenv("spark.master", "local[*]")) sc = SparkContext(conf=conf) # SQL support sqlContext = SQLContext.getOrCreate(sc) # Start H2O services h2oContext = H2OContext(sc).start() # Define file names chicagoAllWeather = "chicagoAllWeather.csv" chicagoCensus = "chicagoCensus.csv" chicagoCrimes10k = "chicagoCrimes10k.csv" # Add files to Spark Cluster sc.addFile(_locate(chicagoAllWeather)) sc.addFile(_locate(chicagoCensus)) sc.addFile(_locate(chicagoCrimes10k)) # Since we have already loaded files into spark, we have to use h2o.upload_file instead of h2o.import_file since # h2o.import_file expects cluster-relative path (ie. the file on this path can be accessed from all the machines on the cluster) # but SparkFiles.get(..) already give us relative path to the file on a current node which h2o.upload_file can handle ( it uploads file # located on current node and distributes it to the H2O cluster) f_weather = h2o.upload_file(SparkFiles.get(chicagoAllWeather)) f_census = h2o.upload_file(SparkFiles.get(chicagoCensus)) f_crimes = h2o.upload_file(SparkFiles.get(chicagoCrimes10k)) # Transform weather table # Remove 1st column (date) f_weather = f_weather[1:]
# np.random.seed(1337) # To match with MATLAB # ---------------------------LOGGING---------------------------------------------- logfname = SPARK_HOME + 'log_size_' + str(GRAPH_NODES) + '_' + \ datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + '.log' logging.basicConfig(filename=logfname, filemode='w', level= logging.INFO, \ format='%(asctime)s:%(levelname)s:%(message)s', \ datefmt='%m/%d/%Y %I:%M:%S %p') logging.warn(sys.argv[0] + '\n SPARK_HOME = ' + SPARK_HOME \ + '\n p = ' + str(SQUARE_BLOCK_SIZE)) # ----------------Create new Spark config--------------------------------------------- # 0 means unlimited; if driver fails, set some value like 16g conf = SparkConf().set("spark.driver.maxResultSize", "32g") conf.set("spark.akka.frameSize", "2040") sc = SparkContext(conf=conf, appName="Commute time distances ") sqlContext = SQLContext(sc) sc.addFile(SPARK_HOME + "construct_graphs.py") # ---------------------------------------------------------------------------------- n, p = GRAPH_NODES, SQUARE_BLOCK_SIZE zfile1, zfile2 = RESULTS_DIR + 'elections-12-'+ str(n) + '-Z.mat', \ RESULTS_DIR + 'elections-16-'+ str(n) + '-Z.mat' if not os.path.exists(zfile1): RESULTS_dict = {} A1 = constructGraphs.createAdjMat(n, 12, SPARSE_GRAPH, p, sc) Z1 = commuteTimeDistancesEmbed(A1, tol, epsilon, d) RESULTS_dict['Z'] = Z1 if not os.path.exists(RESULTS_DIR): os.makedirs(RESULTS_DIR) sio.savemat(zfile1, RESULTS_dict)
def main(): config = configparser.ConfigParser() config.read('config.ini') #Path for Gromacs project gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path')) #Path where PDB ligand are - They are NOT participated in docking pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path') #Path that contains all files for analysis path_analysis = config.get('DEFAULT', 'path_analysis') #Path where all pdb receptor are path_receptor_pdb = config.get('DEFAULT', 'pdb_path') #Path for saving pdb files of models generated by VS path_analysis_pdb = get_directory_pdb_analysis(path_analysis) # Create SPARK config maxResultSize = str(config.get('SPARK', 'maxResultSize')) conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize)) # Create context sc = SparkContext(conf=conf) #Adding Python Source file #Path for drugdesign project path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign') sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"os_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"gromacs_utils.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"pdb_io.py")) sc.addPyFile(os.path.join(path_spark_drugdesign,"json_utils.py")) #Adding bash scripts sc.addFile(os.path.join(path_spark_drugdesign,"make_ndx_buried_area_receptor.sh")) sc.addFile(os.path.join(path_spark_drugdesign,"make_ndx_buried_area_receptor_res.sh")) #Parameters form command line #Indicates probe. Example: 0.14 #probe = float(sys.argv[1]) #Indicates ndots. Example: 24 #ndots = int(sys.argv[2]) #Broadcast path_analysis_pdb_complex_b = sc.broadcast(path_analysis_pdb) gromacs_path = sc.broadcast(gromacs_path) pdb_ligand_path = sc.broadcast(pdb_ligand_path) #probe = sc.broadcast(probe) #ndots = sc.broadcast(ndots) start_time = datetime.now() os.environ["GMX_MAXBACKUP"]="-1" #Loading all PDB receptor files into memory list_all_pdb_receptor_files_path = [] all_receptor_for_complex = get_files_pdb(path_receptor_pdb) for receptor in all_receptor_for_complex: list_all_pdb_receptor_files_path.append(loading_pdb_2_list(receptor)) for pdb_receptor_files in list_all_pdb_receptor_files_path: #Getting receptor name by fully path base_file_name_receptor = get_name_receptor_pdb(str(pdb_receptor_files[0])) #PDB file loaded into memory is sent by broadcast pdb_file_receptor = pdb_receptor_files[1] pdb_file_receptor = sc.broadcast(pdb_file_receptor) #Loading PDB model files based on receptor into memory base_file_name_receptor_for_filter = base_file_name_receptor+"_-_" all_model_for_complex = get_files_pdb_filter(path_analysis_pdb,base_file_name_receptor_for_filter) all_model_for_complexRDD = sc.parallelize(all_model_for_complex) all_model_filesRDD = all_model_for_complexRDD.map(loading_pdb_2_list).collect() # ********** Starting function ********************************************************** def save_model_receptor(list_receptor_model_file): receptor_file = pdb_file_receptor.value #Obtained from broadcast model_file = list_receptor_model_file[0] full_path_for_save_complex = list_receptor_model_file[1] #Open file for writting the complex f_compl = open(full_path_for_save_complex, "w") #Insert lines of receptor for item in receptor_file: f_compl.write(item) #Insert lines of model and insert Z chain for item in model_file: item = replace_chain_atom_line(item,"d","z") f_compl.write(item) f_compl.close() # ********** Finish function ********************************************************** # ********** Starting function 
********************************************************** def compute_buried_area_all_residues_and_receptor_area(pdb_complex): chZ = "chZ" res_buried_area_perc = -1 res_buried_area = -1 buried_receptor_system = -1 buried_receptor_res = -1 base_name = get_name_model_pdb(pdb_complex) ligand_name = get_ligand_from_receptor_ligand_model(base_name) receptor_name = get_receptor_from_receptor_ligand_model(base_name) pose = get_model_from_receptor_ligand_model(base_name) #output area receptor file f_output_receptor_buried_area = os.path.join(path_analysis_pdb_complex_b.value,base_name+".outAreaRecep") #ndx files #f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx") f_ndx_temporary_index_z = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_index_z"+".ndx") f_ndx_temporary = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary"+".ndx") f_ndx_temporary_sasa = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa"+".ndx") #xvg files f_xvg_temporary_sasa_res_lig = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa_res-lig"+".xvg") f_xvg_temporary_sasa_res = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa_res"+".xvg") f_xvg_temporary_sasa_rec_lig = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa_rec_lig"+".xvg") f_xvg_temporary_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa_rec"+".xvg") # Creates a selection with the residues that are closer than 6A to the ligand script_make_ndx_buried_area_receptor = SparkFiles.get("make_ndx_buried_area_receptor.sh") #Getting bash script that was copied by addFile command command = script_make_ndx_buried_area_receptor + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx_temporary_index_z + " "+ f_ndx_temporary process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() #coping file if os.path.exists(f_ndx_temporary): shutil.copy(f_ndx_temporary, f_ndx_temporary_sasa) #Get all residues for computing area receptor all_res = get_residues_receptor_from_ndx_files(f_ndx_temporary) returned_list = [] for res in all_res: script_make_ndx_buried_area_receptor_res = SparkFiles.get("make_ndx_buried_area_receptor_res.sh") #Getting bash script that was copied by addFile command command = script_make_ndx_buried_area_receptor_res + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx_temporary_sasa + " "+ str(res) process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() # compute surface of system - saved on xvg command = gromacs_path.value +"gmx sasa -surface complex -output rec_"+str(res)+ " -o "+ f_xvg_temporary_sasa_res_lig + " -xvg none -f " + pdb_complex +" -s " + pdb_complex + " -n "+ f_ndx_temporary + " -nopbc " process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() # compute surface of receptor - save on xvg command = gromacs_path.value +"gmx sasa -surface rec -output rec_"+str(res)+ " -o "+ f_xvg_temporary_sasa_res + " -xvg none -f " + pdb_complex +" -s " + pdb_complex + " -n "+ f_ndx_temporary + " -nopbc " process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() #calculate area if os.path.exists(f_xvg_temporary_sasa_res_lig): buried_receptor_system = get_value_from_xvg_sasa(f_xvg_temporary_sasa_res_lig) else: buried_receptor_system = 0 if os.path.exists(f_xvg_temporary_sasa_res): buried_receptor_res = 
get_value_from_xvg_sasa(f_xvg_temporary_sasa_res) else: buried_receptor_res = 0 res_buried_area = buried_receptor_res - buried_receptor_system if (res_buried_area > 0) and (buried_receptor_res > 0): res_buried_area_perc = res_buried_area/buried_receptor_res #Generating result result = (base_name, res, res_buried_area, res_buried_area_perc) returned_list.append(result) #Deleting files if os.path.exists(f_xvg_temporary_sasa_res_lig): os.remove(f_xvg_temporary_sasa_res_lig) if os.path.exists(f_xvg_temporary_sasa_res): os.remove(f_xvg_temporary_sasa_res) #Computing Receptor Area command = gromacs_path.value +"gmx sasa -surface complex -output rec"+ " -o "+ f_xvg_temporary_sasa_rec_lig + " -xvg none -f " + pdb_complex +" -s " + pdb_complex + " -n "+ f_ndx_temporary + " -nopbc " process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() command = gromacs_path.value +"gmx sasa -surface rec -output rec"+ " -o "+ f_xvg_temporary_sasa_rec + " -xvg none -f " + pdb_complex +" -s " + pdb_complex + " -n "+ f_ndx_temporary + " -nopbc " process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() if os.path.exists(f_xvg_temporary_sasa_rec_lig): sasa_rec_lig = get_value_from_xvg_sasa(f_xvg_temporary_sasa_rec_lig) else: sasa_rec_lig = 0 if os.path.exists(f_xvg_temporary_sasa_rec): sasa_rec = get_value_from_xvg_sasa(f_xvg_temporary_sasa_rec) else: sasa_rec = 0 receptor_area = sasa_rec - sasa_rec_lig #Saving result file output_receptor_buried_area = open(f_output_receptor_buried_area, "w") output_receptor_buried_area.write(str(base_name)+" "+str(receptor_area) +"\n") output_receptor_buried_area.close() #Deleting all files if os.path.exists(f_xvg_temporary_sasa_rec_lig): os.remove(f_xvg_temporary_sasa_rec_lig) if os.path.exists(f_xvg_temporary_sasa_rec): os.remove(f_xvg_temporary_sasa_rec) if os.path.exists(f_ndx_temporary): os.remove(f_ndx_temporary) if os.path.exists(f_ndx_temporary_sasa): os.remove(f_ndx_temporary_sasa) if os.path.exists(f_ndx_temporary_index_z): os.remove(f_ndx_temporary_index_z) return returned_list else: #Here means that some problem for computing area return (base_name, "NAN", float(0), float(0)) # ********** Finish function ********************************************************** # ********** Starting function ********************************************************** def build_list_model_for_complex(model): full_path_model = model[0] model_file = model[1] path_pdb_complex = path_analysis_pdb_complex_b.value #Obtained from broadcast #Building complex file based on model file name base_name_model = get_name_model_pdb(full_path_model) complex_name = "compl_"+base_name_model+".pdb" full_path_for_save_complex = os.path.join(path_pdb_complex,complex_name) list_receptor_model_file = (model_file, full_path_for_save_complex) save_model_receptor(list_receptor_model_file) list_ret = compute_buried_area_all_residues_and_receptor_area(full_path_for_save_complex) if os.path.exists(full_path_for_save_complex): os.remove(full_path_for_save_complex) return list_ret # ********** Finish function ********************************************************** #Computing buried area of All-residues and receptor all_model_filesRDD = sc.parallelize(all_model_filesRDD) all_model_filesRDD = all_model_filesRDD.map(build_list_model_for_complex).collect() full_area_file = os.path.join(path_analysis,base_file_name_receptor+".recepArea") save_receptor_buried_area_receptor(full_area_file, all_model_filesRDD) # ***************** Starting 
******************************************/ #Loading All-residues files all_area_file = os.path.join(path_analysis,"*.recepArea") buried_areaRDD = sc.textFile(all_area_file).map(loading_lines_from_recepArea_files).collect() #Sorting by res_buried_area_perc column buried_area_sorted_by_res_buried_area_perc = sorting_buried_area_all_residues(sc, buried_areaRDD) buried_area_sorted_by_res_buried_area_perc = buried_area_sorted_by_res_buried_area_perc.map(lambda p: (p.res, p.res_buried_area, p.res_buried_area_perc, p.pose) ).collect() #p.receptor, p.ligand, p.model, #Saving buried area file path_file_buried_area = os.path.join(path_analysis, "all-residue_buried_areas.dat") save_buried_area_recep(path_file_buried_area, buried_area_sorted_by_res_buried_area_perc) #Removing all area files all_area_files = get_files_recepArea(path_analysis) for area_file in all_area_files: os.remove(area_file) # ***************** Finish ******************************************/ # ***************** Starting ******************************************/ #Loading outAreaRecep files all_outAreaRecep_file = os.path.join(path_analysis_pdb,"*.outAreaRecep") buried_outAreaRecepRDD = sc.textFile(all_outAreaRecep_file).map(loading_lines_from_outAreaRecep_files).collect() buried_outAreaRecepRDD_sort_by_buried_lig_rec = sorting_by_buried_lig_rec(sc, buried_outAreaRecepRDD) buried_outAreaRecepRDD_sort_by_buried_lig_rec = buried_outAreaRecepRDD_sort_by_buried_lig_rec.map(lambda p: (p.pose, p.buried_lig_rec) ).collect() #Saving buried area receptor file path_file_buried_area_rec = os.path.join(path_analysis, "summary_buried_areas_receptor.dat") save_buried_area_receptor_sort(path_file_buried_area_rec, buried_outAreaRecepRDD_sort_by_buried_lig_rec) #Removing all outAreaRecep files all_outAreaRecep = get_files_outAreaRecep(path_analysis) for outAreaRecep in all_outAreaRecep: os.remove(outAreaRecep) # ***************** Finish ******************************************/ finish_time = datetime.now() save_log(finish_time, start_time)
"longest_values": longest, "average_length": "%.f2" % average } res.append(result) typeCount[2] = 1 return res, typeCount if __name__ == "__main__": config = pyspark.SparkConf().setAll([('spark.executor.memory', '8g'), ('spark.executor.cores', '5'), ('spark.cores.max', '5'), ('spark.driver.memory', '8g')]) sc = SparkContext(conf=config) sc.addFile("FileInputManager.py") sc.addFile("task1_coinflippers.py") sc.addFile("task2_coinflippers.py") spark = SparkSession \ .builder \ .appName("hw2sql") \ .config("spark.some.config.option", "some-value") \ .getOrCreate() sqlContext = SQLContext(spark) fm.iterate_files_from_file_for_task1( sc, spark, sqlContext, "/user/yy3090/input/task1_filename.txt", 0, output_path) sc.stop()
try:
    c_options = parser.parse_args()
    print "Got options:", c_options
except Exception as inst:
    print inst
    parser.print_help()

es_ts_start, es_ts_end, ingestion_id = get_ingestion_start_end_id(c_options)

# Setup SparkContext
sc = SparkContext(appName="extract-features-" + ingestion_id + job_suffix)
sc.addPyFile('hdfs://memex/user/skaraman/extract-features/network.py')
sc.addPyFile('hdfs://memex/user/skaraman/extract-features/tfdeepsentibank.py')
sc.addFile('hdfs://memex/user/skaraman/extract-features/imagenet_mean.npy')
sc.addFile('hdfs://memex/user/skaraman/extract-features/tfdeepsentibank.npy')
conf = SparkConf()
log4j = sc._jvm.org.apache.log4j
log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

# Setup HBase managers
# just to be sure we will be able to write out to the tables
get_create_table(c_options.tab_sha1_infos_name, c_options)
get_create_table(c_options.tab_update_name, c_options)
hbase_fullhost = c_options.hbase_host + ':' + str(c_options.hbase_port)
hbase_man_sha1infos_out = HbaseManager(sc, conf, hbase_fullhost, c_options.tab_sha1_infos_name)
hbase_man_update_out = HbaseManager(sc, conf, hbase_fullhost, c_options.tab_update_name)

# Run extraction
from pyspark import SparkContext
from pyspark import SparkFiles

finddistance = "/home/srimlcloud/temp_dat/pySpark_projects/finddistance.R"
finddistancename = "finddistance.R"

sc = SparkContext("local", "SparkFile App")
sc.addFile(finddistance)
print "Absolute Path -> %s" % SparkFiles.get(finddistancename)
path_save_log = preparing_path(path_save_log)
make_directory(path_save_log)

path_save_output = preparing_path(path_save_output)
make_directory(path_save_output)

# Adding Python source file
sc.addPyFile(os.path.join(path_spark_drugdesign, "docking_description.py"))

# Broadcast
vina_path = sc.broadcast(vina_path)
pdbqt_ligand_path = sc.broadcast(pdbqt_ligand_path)
pdbqt_receptor_path = sc.broadcast(pdbqt_receptor_path)
path_save_output = sc.broadcast(path_save_output)
path_save_log = sc.broadcast(path_save_log)

sc.addFile(config_vina)

file_of_vina_docking = sys.argv[1]
check_file_exists(file_of_vina_docking)

start_time = datetime.now()

def run_vina_docking(vd_obj):
    receptor = ''.join([pdbqt_receptor_path.value, vd_obj.get_receptor(), ".pdbqt"])
    ligand = ''.join([pdbqt_ligand_path.value, vd_obj.get_ligand(), ".pdbqt"])
    output_save = ''.join([path_save_output.value, vd_obj.get_receptor(),
from hdfs_paths import hdfs_path, make_hdfs_dirs

# load yaml config from this dir
config_path = os.path.join(os.path.dirname(__file__), "config.yaml")
# yaml.load without an explicit Loader is deprecated and unsafe; safe_load suffices here
with open(config_path) as f:
    config = yaml.safe_load(f)

# set up Spark
conf = SparkConf()
conf.set("spark.executor.instances", 8)
sc = SparkContext("yarn-client", "pyspark-demo", conf=conf)

# keys output in each dictionary from map_each_image; the values are np.arrays
RESULT_KEYS = ["cen", "histo", "ward", "pca_fac", "pca_var", "phash"]

# Do addFile so remote workers have the python code
sc.addFile(os.path.join(os.path.dirname(__file__), "hdfs_paths.py"))
sc.addFile(os.path.join(os.path.dirname(__file__), "map_each_image.py"))
sc.addFile(config_path)
sc.addFile(os.path.join(os.path.dirname(__file__), "search.py"))
sc.addFile(os.path.join(os.path.dirname(__file__), "fuzzify_training.py"))

# These are options to the flat_map_indicators function, which can do these mappings.
options_template = {
    "cluster_to_flattened": True,
    "cluster_to_key": True,
    "cluster_to_phash": True,
    # TODO it would be more efficient to combine cluster_to_phash with cluster_to_ward
    "cluster_to_ward": True,
    "flattened_to_cluster": True,
import numpy as np
from csv import reader
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.sql.functions import col
from pyspark import SparkFiles
import datetime
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("building a warehouse")
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)

sc.addFile("/home/hk2451/project/Cool_name_pending/codes/cleaning/cleaning_io.py")
sc.addFile("/home/hk2451/project/Cool_name_pending/codes/mutual_information.py")
import cleaning_io as clean
import mutual_information as mi

##################################
# Inner Join and filter by year
##################################
# find the common name column and change it
def process(df1, df2, year):
    colA = df1.columns
    colB = df2.columns
    colAB = list(set(colA).intersection(colB))
import time
from tqdm import tqdm
import re
import string
import os
from pyspark import SparkConf, SparkContext
from collections import Counter

ROOT = '/data0/lucy/ingroup_lang/'
DATA = ROOT + 'data/'
LOG_DIR = ROOT + 'logs/'
SUBREDDITS = DATA + 'subreddit_list.txt'
SR_FOLDER_MONTH = ROOT + 'subreddits_month/'

conf = SparkConf()
sc = SparkContext(conf=conf)
sc.addFile('/data0/lucy/langid.py/langid/langid.py')
import langid

reddits = set()

def get_language(text):
    return langid.classify(text)[0]

def id_langs():
    lang_dict = {}
    log_file = open(LOG_DIR + 'language_id.temp', 'w')
    for sr in os.listdir(SR_FOLDER_MONTH):
        log_file.write(sr + '\n')
        path = SR_FOLDER_MONTH + sr + '/RC_sample'
        data = sc.textFile(path)
        data = data.filter(lambda line: not line.startswith('@@#USER#@@_'))
def test5():
    sc = SparkContext('local', 'first app')
    file = '/home/Edison/project/chatbot/README.md'
    sc.addFile(file)
    print('------file: %s---------' % SparkFiles.get('README.md'))
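# test5 only resolves the shipped path on the driver. A minimal sketch of the usual
# follow-up, reading the shipped copy inside an executor task, under the same
# assumptions (local master, same README.md path); illustrative only:
from pyspark import SparkContext, SparkFiles

def test5_read_on_executors():
    sc = SparkContext('local', 'first app')
    sc.addFile('/home/Edison/project/chatbot/README.md')

    def count_chars(_):
        # SparkFiles.get also works inside tasks: each executor resolves its own
        # local copy of the file distributed by addFile.
        with open(SparkFiles.get('README.md')) as f:
            return len(f.read())

    # one dummy element in one partition, just to run the task once
    print(sc.parallelize([0], 1).map(count_chars).collect())
    sc.stop()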
# You can also give an HDFS directory path here. For example, if your images are
# stored in HDFS under '/user/maddy/my_images/', use that path instead.
img_dir = './my_images/'

# Directory where images are stored after a face is detected.
# Once a face is found, we draw a rectangle around it and save the annotated image
# into the directory below.
rect_img_dir = './face_detected/'

# Haar Cascade classifier (from the OpenCV library), used to detect frontal faces.
# Give the path of the classifier file below.
distCascade = "./haarcascade_frontalface_default.xml"

# Ship the cascade file to every node in the Spark cluster.
# This is necessary when this Spark code runs on a multi-node cluster.
sc.addFile(distCascade)

# Convert the images into an RDD of (path, bytes) pairs.
# For more details about this function, see help(sc.binaryFiles).
images_RDD = sc.binaryFiles(img_dir)

# With a very large number of images (say, a million), Spark will create many
# partitions by default. To repartition the image data into fewer partitions,
# uncomment the line below and adjust the number:
# images_RDD = images_RDD.repartition(20000)

# Face detection function
def face_detect(an_img_rdd_element):
    x = an_img_rdd_element[0]
    img = an_img_rdd_element[1]
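# The excerpt above stops just inside face_detect. A minimal sketch of how such a
# mapper could continue, assuming OpenCV (cv2) and numpy are installed on the
# workers; this helper is illustrative, not the original implementation:
import os
import numpy as np
import cv2
from pyspark import SparkFiles

def face_detect_sketch(an_img_rdd_element, rect_img_dir='./face_detected/'):
    path, img_bytes = an_img_rdd_element
    # Decode the raw bytes delivered by sc.binaryFiles into an OpenCV image
    img = cv2.imdecode(np.frombuffer(img_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
    # Load the cascade from the copy shipped to this node by addFile
    cascade = cv2.CascadeClassifier(
        SparkFiles.get('haarcascade_frontalface_default.xml'))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
    out_path = os.path.join(rect_img_dir, os.path.basename(path))
    cv2.imwrite(out_path, img)
    return path, len(faces)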
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 06:54:46 2020

@author: joshua
"""
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import BooleanType
from pyspark import SparkContext

import computedistance

sc = SparkContext("local", "SparkFile App")
sc.addFile("/home/ubuntu/Housing-Insight/process_datasets/computedistance.py")

boroughs = ["BRONX", "BROOKLYN", "STATEN ISLAND", "QUEENS", "MANHATTAN"]

def handle_building(building, _311_service):
    if _311_service.city in boroughs:
        if building.borough != _311_service.city:
            return False
        latlong = [building.longitude, building.latitude]
        latlong2 = [_311_service.longitude, _311_service.latitude]
        if computedistance.computeDistance(latlong, latlong2) < 1.5:
            return True
        else:
            return False
def main(InfoAppName="consumer", InfoTopic="chpart1",
         InfoAddPyFile="/home/test/CoreProcessAType__.py",
         InfoTotalCore=4, SysPath=""):
    global importFile, sysPath

    prop = configparser.RawConfigParser()
    prop.read('SparkConfig.properties')

    importFile = InfoAddPyFile
    sysPath = SysPath

    conf = SparkConf()
    conf.setMaster(prop.get('SparkConfig', 'spark.master'))
    conf.setAppName(InfoAppName)
    ## conf.set("spark.cores.max", InfoTotalCore)
    conf.set("spark.streaming.backpressure.enabled", prop.get('SparkConfig', 'spark.streaming.backpressure.enabled'))
    conf.set("spark.executor.memory", prop.get('SparkConfig', 'spark.executor.memory'))
    conf.set("spark.python.worker.memory", prop.get('SparkConfig', 'spark.python.worker.memory'))
    conf.set("spark.streaming.concurrentJobs", prop.get('SparkConfig', 'spark.streaming.concurrentJobs'))
    conf.set("spark.executor.cores", prop.get('SparkConfig', 'spark.executor.cores'))
    conf.set("spark.task.cpus", prop.get('SparkConfig', 'spark.task.cpus'))
    conf.set("spark.executor.extraLibraryPath", prop.get('SparkConfig', 'spark.executor.extraLibraryPath'))
    conf.set("spark.locality.wait", prop.get('SparkConfig', 'spark.locality.wait'))
    conf.set("spark.scheduler.mode", prop.get('SparkConfig', 'spark.scheduler.mode'))
    conf.set("spark.streaming.blockInterval", prop.get('SparkConfig', 'spark.streaming.blockInterval'))
    conf.set("spark.serializer", prop.get('SparkConfig', 'spark.serializer'))

    kafkaParams = {
        "metadata.broker.list": prop.get('KafkaConfig', 'metadata.broker.list'),
        "group.id": prop.get('KafkaConfig', 'group.id')
    }

    sc = SparkContext(conf=conf)
    sc.addPyFile(InfoAddPyFile)
    sc.addFile('c_count.txt')
    sc.addFile('p_count.txt')

    ssc = StreamingContext(sc, 1)
    topic1 = [InfoTopic]
    dstream = KafkaUtils.createDirectStream(ssc, topic1, kafkaParams, valueDecoder=none_decoder)
    parse_rdd(dstream)

    ssc.start()
    ssc.awaitTermination()
classifier.show_most_informative_features()

def mapper(line, title, secfile, idsec):
    post = mdb.posts
    tokens = word_tokenize(line)
    tagged = pos_tag(tokens)
    ntities = chunk.ne_chunk(tagged)
    newline = line.encode('utf-8')
    posting = {"securitynow_id": idsec,
               "episode": secfile[3:6],
               "speaker": title,
               "original": line,
               "tokens": tokens,
               "entities": ntities,
               "sentiment": classifier.classify(dict([(word, True) for word in newline]))}
    post_id = post.insert(posting)

sc.addFile("/home/th3m4d0n3/NetBeansProjects/twAppDemo/data_dir/allSentimentData")
with open(SparkFiles.get("allSentimentData")) as f:
    reader = csv.reader(f, delimiter=" ", quotechar='"')
    jobs = bg.BackgroundJobManager()
    map(parseForNltk, reader)

print("chezdata type DATA: {0} COUNT: {1}".format(type(chezdata), len(chezdata)))
map(getHighest, chezdata)

chezdataP = sc.parallelize(chezdata)
lowRatedP = sc.parallelize(lowRated)
highlyRatedP = sc.parallelize(highlyRated)

print("chezdataP type DATA: {0} COUNT: {1}".format(type(chezdataP), chezdataP.count()))
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
from numpy import array

#### TODO change to the cluster directory
trainF = "/home/xavier.callens/DataCamp/train"  # the path to where the train data is

sc = SparkContext(appName="Simple App")  # initialize the spark context
# Since we are not in the command line interface we need to add to the spark context
# some of our modules so that they are available to the workers.
sc.addFile("helpers.py")
sc.addFile("extract_terms.py")  # was "exctract_terms.py"; the module imported below is extract_terms
# Now if we import these files they will also be available to the workers
from helpers import *
import extract_terms as et

# Load data: data is a list with the text per doc in each cell; Y is the respective class value
# (1: positive, 0: negative)
print "loading local data"
data, Y = lf.loadLabeled(trainF)
print "preprocessing"
pp.proc(data)  # clean the data of numbers, html tags and punctuation (except "?!.", where "?!" is replaced by ".")
m = TfidfVectorizer(analyzer=et.terms)  # m is a compressed tfidf matrix; the terms are extracted with our own custom function
CONF = {
    'spark.driver.extraClassPath':
        os.environ['HOME'] + 'mongo-hadoop/spark/build/libs/mongo-hadoop-spark.jar'
}
STANFORD_SEGMENTER = APP_HOME + '/stanford_segmenter'
STANFORD_POSTAGGER = APP_HOME + '/stanford-postagger'
STANFORD_MODELS = STANFORD_POSTAGGER + '/models'
LOG_DIR = 'log'
MONGO_SERVER = 'localhost'
MONGO_PORT = 27017
DB = 'tweets_data'

### Prepare SparkContext ###
conf = SparkConf().setAppName(APP_NAME)
for prop, val in CONF.items():  # set configuration properties
    conf.set(prop, val)
sc = SparkContext(conf=conf, environment=ENV_VARS)
for f in PY_FILES:  # add dependencies
    sc.addPyFile('%s://%s/%s' % (FILESYSTEM, APP_HOME, f))
for f in FILES:  # add required files
    sc.addFile('%s://%s/%s' % (FILESYSTEM, APP_HOME, f))
pymongo_spark.activate()
sys.path.append(SPARK_HOME_PYTHON)
from pyspark import SparkContext
from pyspark import SparkConf

sc = SparkContext(appName='topXIp')
# test local speed: only around 75s, much faster
# sc = SparkContext('local', 'topXIp')
# X = sys.argv[1]

# normal
normalFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
normalPath = os.path.join(normalFilePath)
sc.addFile(normalPath)

# attack
attackFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
attackPath = os.path.join(attackFilePath)
sc.addFile(attackPath)

from pyspark import SparkFiles
# SparkFiles.get expects the bare file name registered by addFile, not the full source path
normalRdd = sc.textFile(SparkFiles.get(os.path.basename(normalFilePath))).cache()
attackRdd = sc.textFile(SparkFiles.get(os.path.basename(attackFilePath))).cache()

# src, dst, data_length, protocol_name, protocol_number, arrival_time (len = 6)
normalRaw = normalRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache()
attackRaw = attackRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache()
def predict(row_coord, cSize, model):
    vector_dict = {}
    for w in row_coord[1]:
        vector_dict[int(w[1])] = w[0]
    return (row_coord[0], model.value.predict(SparseVector(cSize.value, vector_dict)))

trainF = "./data/train"        # the path to where the train data is
testF = "./data/test"          # the path to the unlabelled data
saveF = "./predictions.txt"    # where to save the predictions

sc = SparkContext(appName=" \--(o_o)--/ ")  # initialize the spark context
# Since we are not in the command line interface we need to add to the spark context
# some of our modules so that they are available to the workers.
sc.addFile("/home/julien.hamilius/datacamp/code/helpers.py")
sc.addFile("/home/julien.hamilius/datacamp/code/extract_terms.py")
# Now if we import these files they will also be available to the workers
from helpers import *
import extract_terms as et

# Load data: data is a list with the text per doc in each cell; Y is the respective class value
# (1: positive, 0: negative)
print "loading local data"
data, Y = lf.loadLabeled(trainF)
print "preprocessing"
pp.proc(data)  # clean the data of numbers, html tags and punctuation (except "?!.", where "?!" is replaced by ".")
m = TfidfVectorizer(analyzer=et.terms)  # m is a compressed tfidf matrix; the terms are extracted with our own custom function
from pyspark import SparkContext as SC
from pyspark.sql import SQLContext
from pyspark import SparkFiles
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

sc = SC()
url = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
sc.addFile(url)
sqlContext = SQLContext(sc)

# df = sqlContext.read.csv(SparkFiles.get("adult_data.csv"), header=True, inferSchema=True)
# With inferSchema=False all values are read as strings.
# Note: addFile(url) downloads "adult_data.csv", so that is the name SparkFiles.get must resolve.
df_string = sqlContext.read.csv(SparkFiles.get("adult_data.csv"), header=True, inferSchema=False)

def convertColumn(df, names, newType):
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df

CONTI_FEATURES = ['age', 'fnlwgt', 'capital-gain', 'educational-num',
                  'capital-loss', 'hours-per-week']
df_string = convertColumn(df_string, CONTI_FEATURES, FloatType())  # schema casted
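# The StringIndexer/OneHotEncoder/VectorAssembler imports above are not used in this
# excerpt. A minimal sketch of how they are typically combined downstream, assuming
# the dataset has a categorical 'workclass' column alongside the continuous features
# cast above (illustrative, not the original pipeline):
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Index and one-hot encode the categorical column, then assemble everything into a
# single features vector usable by Spark ML estimators.
indexer = StringIndexer(inputCol="workclass", outputCol="workclass_idx")
encoder = OneHotEncoder(inputCol="workclass_idx", outputCol="workclass_vec")
assembler = VectorAssembler(
    inputCols=["age", "fnlwgt", "capital-gain", "educational-num",
               "capital-loss", "hours-per-week", "workclass_vec"],
    outputCol="features")

pipeline = Pipeline(stages=[indexer, encoder, assembler])
df_features = pipeline.fit(df_string).transform(df_string)
df_features.select("features").show(5, truncate=False)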
import os
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark import SparkFiles

conf = SparkConf().setMaster('local')
sc = SparkContext(conf=conf, appName='DemoPipeR')

contactsContactList = sc.parallelize([('null', '45.4,34.2,90.3,66.1'),
                                      ('null', '49.3,31.6,42.3,76.7'),
                                      ('null', '40.9,36.2,99.8,16.0')])

# Compute the distance of each call using an external R program
distScript = os.getcwd() + "/find_distance.R"
distScriptName = "find_distance.R"
sc.addFile(distScript)

def hasDistInfo(call):
    """Verify that a call has the fields required to compute the distance"""
    requiredFields = ["mylat", "mylong", "contactlat", "contactlong"]
    return all(map(lambda f: call[f], requiredFields))

def formatCall(call):
    """Format a call so that it can be parsed by our R program"""
    return "{0},{1},{2},{3}".format(
        call["mylat"], call["mylong"], call["contactlat"], call["contactlong"])

# Here we do not bother with storing dictionaries in contactsContactList,
# so the values are already formatted "lat,long,lat,long" strings.
# pipeInputs = contactsContactList.values().flatMap(lambda calls: map(formatCall, filter(hasDistInfo, calls)))
pipeInputs = contactsContactList.values()
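# The excerpt ends before the pipe itself. A minimal sketch of the usual next step
# (the same pattern as the other pipe examples in this collection), assuming
# find_distance.R is executable with an Rscript shebang and reads one
# "lat,long,lat,long" line per record from stdin:
distances = pipeInputs.pipe(SparkFiles.get(distScriptName))
print(distances.collect())
sc.stop()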
#!/usr/bin/python
# ./spark-submit spark.py fmriData.csv
import sys
import os
from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array

if __name__ == "__main__":
    csvName = sys.argv[1]
    path = os.path.realpath(csvName)
    context = SparkContext('local', 'fmri analysis')
    context.addFile(path)
limit = 50

if __name__ == "__main__":
    sc = SparkContext(appName="MTurk")

    inputFilename = sys.argv[1]
    outputDirectory = sys.argv[2]
    featureListFilename = sys.argv[3]
    crfModelFilename = sys.argv[4]
    eyeRef = sys.argv[5]
    eyeConfig = sys.argv[6]
    hairRef = sys.argv[7]
    hairConfig = sys.argv[8]

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)

    # Add files to be downloaded with this Spark job on every node.
    sc.addFile("/usr/local/bin/crf_test")
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEye = HybridJaccard(ref_path=eyeRef, config_path=eyeConfig)
    smHair = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    rdd = sc.sequenceFile(inputFilename)
    if limit:
        rdd = sc.parallelize(rdd.take(limit))

    rdd_json = rdd.mapValues(lambda x: json.loads(x))
    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))
PY_FILES = ['settings.py', 'stanford_segmenter.py', 'pos_tag.py', 'logger.py', 'pymongo_spark.py']
FILES = ['NER.model']
CONF = {
    'spark.driver.extraClassPath':
        os.environ['HOME'] + 'mongo-hadoop/spark/build/libs/mongo-hadoop-spark.jar'
}
STANFORD_SEGMENTER = APP_HOME + '/stanford_segmenter'
STANFORD_POSTAGGER = APP_HOME + '/stanford-postagger'
STANFORD_MODELS = STANFORD_POSTAGGER + '/models'
LOG_DIR = 'log'
MONGO_SERVER = 'localhost'
MONGO_PORT = 27017
DB = 'tweets_data'

### Prepare SparkContext ###
conf = SparkConf().setAppName(APP_NAME)
for prop, val in CONF.items():  # set configuration properties
    conf.set(prop, val)
sc = SparkContext(conf=conf, environment=ENV_VARS)
for f in PY_FILES:  # add dependencies
    sc.addPyFile('%s://%s/%s' % (FILESYSTEM, APP_HOME, f))
for f in FILES:  # add required files
    sc.addFile('%s://%s/%s' % (FILESYSTEM, APP_HOME, f))
pymongo_spark.activate()
spark_config.setExecutorEnv('AWS_ACCESS_KEY_ID', config.access_key)
# note: no trailing space in the variable name, or the executors will not see it
spark_config.setExecutorEnv('AWS_SECRET_ACCESS_KEY', config.secret_access_key)

# Build up the context, using the master URL
sc = SparkContext('spark://ulex:7077', 'mean', conf=spark_config)

local_data_path = '/media/bitbucket/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'
data_path = local_data_path
data_url = 'https://nasanex.s3.amazonaws.com/NEX-DCP30/BCSD/rcp26/mon/atmos/pr/r1i1p1/v1.0/CONUS/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'

if download:
    data_path = data_url

# Download the file onto each node
if download or config.copy_local:
    sc.addFile(data_path)

# We still need to open the dataset on the master node to get the number of timesteps.
# For some reason the master node doesn't seem to be able to access the downloaded
# version; this may be a bug in addFile(...)
data = Dataset(local_data_path)
pr = data.variables['pr']

# Get the number of timesteps
num_timesteps = data.variables['time'].size
data.close()

# Now partition timesteps across the cluster
timesteps = sc.parallelize(range(0, num_timesteps), 30)
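# On the workers, the shipped NetCDF file is normally opened through SparkFiles rather
# than the original path. A minimal sketch of a per-timestep mapper over the
# `timesteps` RDD above, assuming netCDF4.Dataset and the 'pr' variable as in the
# snippet (illustrative, not the original job):
import os
from netCDF4 import Dataset
from pyspark import SparkFiles

def mean_pr_for_timestep(t, file_name=os.path.basename(local_data_path)):
    # Each executor resolves its own local copy distributed by addFile
    ds = Dataset(SparkFiles.get(file_name))
    try:
        # Average precipitation over the spatial grid for a single timestep
        return float(ds.variables['pr'][t, :, :].mean())
    finally:
        ds.close()

means = timesteps.map(mean_pr_for_timestep).collect()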
if __name__ == '__main__':
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    datadir = "/YOUR/DATA/DIR/"

    # sudo dpkg --configure -a
    # sudo apt-get install python-setuptools
    # sudo easy_install dateutils
    # Download pyspark_csv.py from https://github.com/seahboonsiew/pyspark-csv
    sys.path.append('/YOUR/PYSPARK_LIBS/DIR')  # replace as necessary
    import pyspark_csv
    sc.addFile('/YOUR/PYSPARK_LIBS/DIR/pyspark_csv.py')  # ditto
    sqlContext = SQLContext(sc)

    # Task 1: load the prop-prices.csv file as an RDD, and use the csvToDataFrame function from the
    # pyspark_csv module to create a DataFrame and register it as a temporary table so that you can
    # run SQL queries:
    print("------- ******* Task 1 ******* -------")

    # Task 2: let's do some basic analysis on the data.
    # Find how many records we have per year, and print them out sorted by year.
    print("------- ******* Task 2 ******* -------")

    # Task 3: Everyone knows that properties in London are expensive.
    # Find the average property price by county, and print the top 10 most expensive counties.
    print("------- ******* Task 3 ******* -------")
if __name__ == '__main__':
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    datadir = "/Users/eyalbenivri/Developer/projects/spark-workshop/data/"

    # sudo dpkg --configure -a
    # sudo apt-get install python-setuptools
    # sudo easy_install dateutils
    # Download pyspark_csv.py from https://github.com/seahboonsiew/pyspark-csv
    sys.path.append('/Users/eyalbenivri/Developer/libs/pyspark_libs')  # replace as necessary
    import pyspark_csv
    sc.addFile('/Users/eyalbenivri/Developer/libs/pyspark_libs/pyspark_csv.py')  # ditto
    sqlContext = SQLContext(sc)

    # Task 1: load the prop-prices.csv file as an RDD, and use the csvToDataFrame function from the
    # pyspark_csv module to create a DataFrame and register it as a temporary table so that you can
    # run SQL queries:
    print("------- ******* Task 1 ******* -------")
    columns = ['id', 'price', 'date', 'zip', 'type', 'new', 'duration', 'PAON', 'SAON',
               'street', 'locality', 'town', 'district', 'county', 'ppd', 'status']
    rdd = sc.textFile(datadir + "prop-prices.csv")
    df = pyspark_csv.csvToDataFrame(sqlContext, rdd, columns=columns)
    df.registerTempTable("properties")
    df.persist()

    # Task 2: let's do some basic analysis on the data.
class SparkUtils:
    def __init__(self, master, app_name):
        if os.environ["pfe_env"] != "dev":
            self.sc = SparkContext(appName=app_name)
            self.sc.addFile('/FileProcessor.py')
            self.sc.addFile('/FileIndexProducer.py')
            self.sc.addFile('/FileIndexRepository.py')
            self.sc.addFile('/FileUrlProcessor.py')
            self.sc.addFile('/LdaTopicsDescriptionProducer.py')
            self.sc.addFile('/LdaTopicsDescriptionRepository.py')
            self.sc.addFile('/Parser.py')
            self.sc.addFile('/SparkProcessor.py')
            self.sc.addFile('/SparkUtils.py')
            self.sc.addFile('/TextMostCommonWordsExtractor.py')
            self.sc.addFile('/TextPreProcessor.py')
            self.sc.addFile('/TextSummarizer.py')
            self.sc.addFile('/thumbnail_temp.py')
            self.sc.addFile('/ThumbnailGenerator.py')
            self.sc.addFile('/NotificationConstants.py')
            self.sc.addFile('/RabbitMqConstants.py')
        else:
            self.sc = SparkContext(master=master, appName=app_name)
        self.sql_context = SQLContext(self.sc)

    # output rdd: (url, b'content')
    def read_files(self, path):
        return self.sc.binaryFiles(path)

    def rdd_to_df(self, rdd, schema):
        df = self.sql_context.createDataFrame(rdd, schema)
        return df

    def join_df(self, df0, df1, join_col, df0_selected_cols, df1_selected_cols):
        df0_selected_cols = ["df0." + x for x in df0_selected_cols]
        df1_selected_cols = ["df1." + x for x in df1_selected_cols]
        df0 = df0.alias('df0')
        df1 = df1.alias('df1')
        df = df0.join(df1, col("df0." + join_col) == col("df1." + join_col)) \
                .select(df0_selected_cols + df1_selected_cols)
        return df
    return (numFailedPredictions, expectedFailedPredictions, numFalseAlarms, numGoodRecords)

# Prepare desired columns
desiredsmartnos = [1, 3, 5, 7, 9, 194, 197]
desiredcolumns = ['date', 'serial_number', 'model', 'failure']
for sno in desiredsmartnos:
    desiredcolumns.append('smart_' + str(sno) + '_normalized')
    desiredcolumns.append('smart_' + str(sno) + '_raw')

if __name__ == "__main__":
    sparkconf = SparkConf().setAppName('hddpredict')
    sparkcontext = SparkContext(conf=sparkconf)
    sparkcontext.addFile(
        'hdfs://ec2-34-204-54-226.compute-1.amazonaws.com:9000/libsvm-322', True)
    sparksql = SparkSession.builder.master('local').appName('hddpredict').getOrCreate()

    # Load the entire data and project the wanted columns.
    # Then partition by individual hard disk and sort by date so we can model each
    # partition as a time series and compute the rate of change of attributes.
    # drivedatadf = sparksql.read.csv('/user/zixian/project/input/*.csv', inferSchema=True, header=True)
    drivedatadf = sparksql.read.csv(
        'hdfs://ec2-34-204-54-226.compute-1.amazonaws.com:9000/data/*.csv',
        inferSchema=True, header=True)
    drivedatadf = drivedatadf.select(desiredcolumns).fillna(0)
    drivedatadf.cache()
'''
Source of school list: http://schools.nyc.gov/schoolsearch/
'''
from __future__ import print_function
import sys
import os
from operator import add
from pyspark import SparkContext
from csv import reader

sc = SparkContext()
sc.addFile("src/helper/assign_basetype.py")
from assign_basetype import *

school_lines = sc.textFile("/user/ac5901/school_number.csv", 1)
school_numbers = school_lines.map(lambda x: x).collect()

def check_school(val):
    basetype = get_basetype(val)
    if basetype == 'TEXT' or basetype == 'INT':
        if val is None or len(str(val).strip()) == 0 or \
                str(val) in ['Unspecified', 'NA', 'N/A', 'N?A', 'NA/']:
            return 'NULL'
        elif str(val) in school_numbers:
            return 'VALID'
        else:
            return 'INVALID'
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division, unicode_literals

from pyspark import SparkConf, SparkContext, SparkFiles

if __name__ == '__main__':
    conf = SparkConf().setAppName('Pipe')
    sc = SparkContext(conf=conf)

    column_count_script = './scripts/columncount.py'
    column_count_script_name = 'columncount.py'
    sc.addFile(column_count_script)

    lines = sc.parallelize(['1,2,3', '4,5', '6', '7,8,9,10'])
    print(lines.pipe(SparkFiles.get(column_count_script_name)).collect())

    sc.stop()
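# The columncount.py script shipped above is not included in this collection. A minimal
# sketch of a stdin-to-stdout script compatible with RDD.pipe might look like the
# following (hypothetical, not the original file; it must be executable so the shebang
# is honored when piped):
#!/usr/bin/env python
# columncount.py - read CSV rows from stdin, write the column count of each row to stdout
import sys

for line in sys.stdin:
    line = line.strip()
    if line:
        print(len(line.split(',')))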