def crfexec(sc, inputFilename, outputDirectory, limit=LIMIT, location='hdfs', outputFormat="text", partitions=None):
    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    crfExecutable = "/usr/local/bin/crf_test"
    crfModelFilename = os.path.join(crfConfigDir, "dig-hair-eye-train.model")

    rdd_pipeinput = sc.textFile(inputFilename)
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()

    # DON'T USE SparkFiles.get to fetch the crf_test or model
    # This only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s -m %s" % (os.path.basename(crfExecutable), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s -m %s" % (SparkFiles.get(os.path.basename(crfExecutable)), SparkFiles.get(os.path.basename(crfModelFilename)))
    else:
        # Guard added: `cmd` was unbound for any other location value
        raise RuntimeError("Unrecognized location: %s" % location)
    print "### %s" % cmd
    rdd_crf = rdd_pipeinput.pipe(cmd)

    rdd_final = rdd_crf
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
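# Hedged driver-side sketch (not from the original project; paths are
# assumptions): before calling crfexec with location='local', the executable
# and model must be shipped with sc.addFile so SparkFiles.get can resolve
# them by basename on each worker.
from pyspark import SparkContext

def ship_crf_dependencies(sc, crf_executable, crf_model):
    # Both files are copied to every executor's scratch directory.
    sc.addFile(crf_executable)
    sc.addFile(crf_model)

# Example usage (hypothetical paths):
# sc = SparkContext(appName="crf-pipe")
# ship_crf_dependencies(sc, "/usr/local/bin/crf_test", "data/config/dig-hair-eye-train.model")
# crfexec(sc, "hdfs:///input/pages", "hdfs:///output/crf", location='local')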
def train_partition(idx, iterator):
    port = 50000 + idx % 256
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset-%02d.hdf5" % idx)

    flag_file = "flags/__BARISTA_READY__.%d" % port
    if os.path.isfile(flag_file):
        os.remove(flag_file)

    # out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver,
                      "--dset-size", "30000",
                      "--initial-replay", "20000",
                      "--debug",
                      "--overwrite",
                      "--port", str(port)])

    while not os.path.isfile(flag_file):
        pass

    for step in iterator:
        dc = DummyClient("127.0.0.1", port)
        dc.send(barista.GRAD_UPDATE)
        response = dc.recv()
        yield response
def predict(self, X):
    """Assumes X is an RDD or a list of (data, label) minibatch tuples."""
    if isinstance(X, RDD):
        # Distribute files
        X.context.addFile(self._solver_filename)
        X.context.addFile(self._architecture_filename)
        # The per-partition work lives in a separate generator method:
        # `return X.mapPartitions(...)` would be illegal inside a generator,
        # and the original discarded the mapPartitions result entirely.
        return X.mapPartitions(self._predict_partition)
    return self._predict_partition(X)

def _predict_partition(self, X):
    solver_filename = \
        SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
    architecture_filename = \
        SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

    # Might need to modify path to architecture file inside solver file.
    # Maybe we should do this before shipping the file since all Spark
    # tmp directories will be identically named.

    net = SGDSolver(solver_filename).net

    for minibatch_data, minibatch_label in X:
        # TODO: update function call for latest Caffe
        net.set_input_arrays(minibatch_data, minibatch_label,
                             self.input_index)
        output = net.forward(end=self.score_blob)
        scores = output[self.score_blob]
        pred = np.argmax(scores, axis=1).squeeze()
        yield pred
def ship_prototxt_to_data(self, rdd):
    rdd.context.addFile(self._solver_filename)
    rdd.context.addFile(self._architecture_filename)

    solver_filename = \
        SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
    architecture_filename = \
        SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

    return solver_filename, architecture_filename
def compute_buried_area(pdb_complex):
    chZ = "chZ"
    sasa_complex = -1.0
    sasa_rec = -1.0
    sasa_lig = -1.0
    buried_total = -1.0

    base_name = get_name_model_pdb(pdb_complex)
    ligand_name = get_ligand_from_receptor_ligand_model(base_name)
    f_pdb_ligand_no_docking = os.path.join(pdb_ligand_path.value, ligand_name + ".pdb")
    f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")
    f_temp_sasa_complex = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_complex.xvg")
    f_temp_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_rec.xvg")
    f_temp_sasa_lig = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig.xvg")

    # Makes the index file with the ligand (chain z) and the rest (non chain z)
    script_make_ndx = SparkFiles.get("make_ndx_buried_area_total.sh")  # Getting bash script that was copied by addFile command
    command = script_make_ndx + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    command = gromacs_path.value + "gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface System " + " -output System " + " -xvg none " + " -o " + f_temp_sasa_complex
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    # Makes f_temp_sasa_rec file
    script_make_sasa_rec = SparkFiles.get("make_sasa_rec_buried_area_total.sh")  # Getting bash script that was copied by addFile command
    command = script_make_sasa_rec + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx + " " + f_temp_sasa_rec
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    command = gromacs_path.value + "gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface chZ " + " -output chZ " + " -xvg none " + " -o " + f_temp_sasa_lig
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    sasa_complex = get_value_from_xvg_sasa(f_temp_sasa_complex)
    sasa_rec = get_value_from_xvg_sasa(f_temp_sasa_rec)
    sasa_lig = get_value_from_xvg_sasa(f_temp_sasa_lig)
    buried_total = sasa_rec + sasa_lig - sasa_complex

    # Generating result - see column sorting, because the result file will be created based on this sorting
    returned_list = (base_name, buried_total)

    # Deleting files
    os.remove(f_ndx)
    os.remove(f_temp_sasa_complex)
    os.remove(f_temp_sasa_rec)
    os.remove(f_temp_sasa_lig)

    return returned_list
def test_add_file_locally(self):
    path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt")
    self.sc.addFile(path)
    download_path = SparkFiles.get("hello.txt")
    self.assertNotEqual(path, download_path)
    with open(download_path) as test_file:
        self.assertEqual("Hello World!\n", test_file.readline())
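# Hedged sketch of the same addFile/SparkFiles.get round trip outside a test
# harness (file path is an assumption): addFile copies the file into the
# driver's and every executor's scratch directory, and SparkFiles.get
# resolves the local copy by basename.
from pyspark import SparkContext, SparkFiles

sc = SparkContext("local[*]", "sparkfiles-roundtrip")
sc.addFile("/tmp/hello.txt")  # assumed to exist on the driver

def first_line(_):
    # Workers resolve the shipped copy by basename.
    with open(SparkFiles.get("hello.txt")) as f:
        return f.readline()

print sc.parallelize([0]).map(first_line).first()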
def load_timestep(timestep):
    path = data_path
    if download or config.copy_local:
        path = SparkFiles.get('pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc')
    data = Dataset(path)
    pr = data.variables['pr']
    step = pr[timestep]
    # Return valid values
    return (timestep, step[~step.mask])
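# Hedged driver-side sketch for load_timestep (path and step count are
# assumptions): the NetCDF file is shipped once with addFile, after which
# each task opens its own local copy.
from pyspark import SparkContext

sc = SparkContext(appName="netcdf-timesteps")
sc.addFile("/data/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc")
valid_steps = sc.parallelize(range(60)).map(load_timestep)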
def spawn_barista(partition):
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset.hdf5")

    flag_file = "flags/__BARISTA_READY__"
    if os.path.isfile(flag_file):
        os.remove(flag_file)

    out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver],
                     stdout=out,
                     stderr=subprocess.STDOUT)

    while not os.path.isfile(flag_file):
        pass
def partitionIp2city(iter):
    import os
    from geoip2 import database

    def ip2city(ip):
        try:
            city = reader.city(ip).city.name
        except:
            city = 'not found'
        return city

    # SparkFiles.get expects the basename of the file shipped via addFile,
    # not the full driver-side path.
    reader = database.Reader(SparkFiles.get(os.path.basename(geoDBpath)))
    # Map every IP in the partition's iterator (this function is meant for
    # mapPartitions, as the name suggests).
    return [ip2city(ip) for ip in iter]
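# Hedged usage sketch for partitionIp2city (IPs are illustrative): ship the
# GeoIP database once, then resolve cities per partition so each task opens
# the reader a single time rather than once per record.
from pyspark import SparkContext

sc = SparkContext(appName="ip2city")
sc.addFile(geoDBpath)  # e.g. '/home/worker/workspace/geoDB/GeoLite2-City.mmdb'
ip_rdd = sc.parallelize(['128.101.101.101', '8.8.8.8'])
print ip_rdd.mapPartitions(partitionIp2city).collect()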
def main(sc):
    sqlContext = SQLContext(sc)
    df = sqlContext.jsonFile(DATA_PATH)

    # add the filter file
    sc.addFile(FILTER_TERMS_FILE_PATH)
    filter_terms = sc.textFile(SparkFiles.get("freebase-symptoms-just-terms.txt"))
    global filter_terms_set_bc
    filter_terms_set_bc = sc.broadcast(Set(filter_terms.collect()))

    # Register the DataFrame as a table.
    df.registerTempTable("tweet")
    results = sqlContext.sql("SELECT id,user.id,user.lang,created_at, coordinates,text FROM tweet where user.lang='en'")

    # filter tweets to find health related tweets
    filter_health_tweets = results.rdd.filter(healthFilter)
    filter_health_tweets.mapPartitions(writeRecords).saveAsTextFile("output/")
def compute_buried_area_ligand(pdb_complex):
    chZ = "chZ"
    buried_lig_rec_perc = -1.0
    buried_lig_rec = -1.0
    buried_lig_lig = -1.0
    buried_lig_lig_perc = -1.0

    base_name = get_name_model_pdb(pdb_complex)
    ligand_name = get_ligand_from_receptor_ligand_model(base_name)
    receptor_name = get_receptor_from_receptor_ligand_model(base_name)
    pose = get_model_from_receptor_ligand_model(base_name)
    pdb_before_vs = os.path.join(pdb_ligand_path.value, ligand_name + ".pdb")

    # ndx files
    f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")

    # xvg files
    xvg_temp_sasa_lig_pose = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig_pose" + ".xvg")
    xvg_temp_sasa_lig_complex = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig_complex" + ".xvg")
    xvg_temp_sasa_lig_min = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig_min" + ".xvg")

    # Creates a selection with the residues that are closer than 6A to the ligand
    script_make_ndx_buried_area_ligand = SparkFiles.get("make_ndx_buried_area_ligand.sh")  # Getting bash script that was copied by addFile command
    command = script_make_ndx_buried_area_ligand + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx + " " + xvg_temp_sasa_lig_pose + " " + str(probe.value) + " " + str(ndots.value) + " " + xvg_temp_sasa_lig_complex + " " + pdb_before_vs + " " + xvg_temp_sasa_lig_min
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    try:
        # SASA of the isolated ligand in the pose conformation
        sasa_lig_pose = get_value_from_xvg_sasa(xvg_temp_sasa_lig_pose)
        # SASA of the complexed ligand in the pose conformation
        sasa_lig_complex = get_value_from_xvg_sasa(xvg_temp_sasa_lig_complex)
        # SASA of the isolated ligand in its energy-minimized conformation. Only for carbohydrates!
        sasa_lig_min = get_value_from_xvg_sasa(xvg_temp_sasa_lig_min)

        # Area of the ligand which is buried in the receptor
        buried_lig_rec = sasa_lig_pose - sasa_lig_complex
        buried_lig_rec_perc = buried_lig_rec / sasa_lig_pose

        # Area of the ligand in the pose conformation which is buried in itself when compared to the energy-minimized conformation
        buried_lig_lig = sasa_lig_min - sasa_lig_pose
        buried_lig_lig_perc = buried_lig_lig / sasa_lig_min

        returned_list = (base_name, buried_lig_rec, buried_lig_rec_perc, buried_lig_lig, buried_lig_lig_perc)

        # Deleting files
        os.remove(f_ndx)
        os.remove(xvg_temp_sasa_lig_pose)
        os.remove(xvg_temp_sasa_lig_complex)
        os.remove(xvg_temp_sasa_lig_min)

        return returned_list
    except:
        return (base_name, float(0.0), float(0.0), float(0.0), float(0.0))
def partition_processor(partitionlinechunks):
    """Partition logic for pyspark parallel processing."""

    model_pipe_object = joblib.load(SparkFiles.get("mmp_phase1_D2.clf"))

    def set_predictions(x):
        segment = model_pipe_object.predict_proba(x)
        return segment

    df_with_nan = build_dataframe(partitionlinechunks)
    df_with_newline = df_with_nan.replace(u"NULL", pd.np.nan)
    behaviour_df = df_with_newline.replace(u"\\N", pd.np.nan)
    predictions_ser = set_predictions(behaviour_df)

    predictions_list = [value for value in [zip(predictions_ser.index,
                                                predictions_ser.loc[:, 'A'],
                                                predictions_ser.loc[:, 'Y'],
                                                predictions_ser.loc[:, 'segment'],
                                                predictions_ser.loc[:, 'model_version'])]]
    return iter(predictions_list)
def load_matrix(filename, sc, num_users=NUM_USER, num_items=NUM_SONG):
    global alpha
    global total
    global num_zeros

    print 'Start to load matrix...'

    t0 = time.time()
    counts = np.zeros((num_users, num_items))
    total = 0.0
    num_zeros = num_users * num_items

    url = "s3n://spark-mllib/fastcode/data/" + filename
    # url = "hdfs://localhost:9000/data/" + filename
    print 'loading... ' + url
    # data = sc.textFile(url)
    # data.map(lambda l: fill_maxtrix(l, counts))

    sc.addFile(url)
    with open(SparkFiles.get(filename)) as f:
        for line in f:
            fill_maxtrix(line, counts)

    alpha = num_zeros / total
    print 'alpha %.2f' % alpha
    counts *= alpha

    t1 = time.time()
    print 'Finished loading matrix in %f seconds\n' % (t1 - t0)
    print 'Total entry:', num_users * num_items
    print 'Non-zeros:', num_users * num_items - num_zeros

    counts = sparse.csr_matrix(counts)

    return counts, num_users * num_items - num_zeros
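# Hedged sketch of the pattern load_matrix relies on (filename is an
# assumption): sc.addFile also downloads the file to the driver, which is
# what lets the driver open it locally via SparkFiles.get immediately after
# the addFile call.
from pyspark import SparkContext, SparkFiles

sc = SparkContext(appName="load-matrix")
sc.addFile("s3n://spark-mllib/fastcode/data/ratings.csv")  # hypothetical file
with open(SparkFiles.get("ratings.csv")) as f:
    print f.readline()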
    return paras


if __name__ == "__main__":
    """
    Usage: test
    """
    # set the jsonString array of samples
    arr = ccm([20, 40], [1, 2], [1, 2], 250)
    # for local
    # scriptPath = "/Users/alexpb/Desktop/Lab/PySparkCUDAC/build/testPySpark"
    # for google cloud platform
    scriptPath = '/data/testPySpark'
    # os.system("chmod u+x %s" % scriptPath)
    if os.path.isfile(scriptPath):
        spark = SparkSession.builder.appName("PySparkCCM").getOrCreate()
        sc = spark.sparkContext
        sc.addFile(scriptPath, True)
        dataRDD = sc.parallelize(arr)
        pipeRDD = dataRDD.pipe(SparkFiles.get(scriptPath))
        for x in pipeRDD.collect():
            try:
                jsonstr = json.loads(x)
                result = jsonstr['result']
                print(sum(result) / float(len(result)))
                # print(jsonstr['result'])
            except ValueError, e:
                print("not valid result received")
        spark.stop()
def compute_buried_area(pdb_complex):
    chZ = "chZ"
    sasa_complex = -1.0
    sasa_rec = -1.0
    sasa_lig = -1.0
    buried_total = -1.0
    returned_list = []
    try:
        base_name = get_name_model_pdb(pdb_complex)
        ligand_name = get_ligand_from_receptor_ligand_model(base_name)
        f_pdb_ligand_no_docking = os.path.join(pdb_ligand_path.value, ligand_name + ".pdb")
        f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")
        f_temp_sasa_complex = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_complex.xvg")
        f_temp_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_rec.xvg")
        f_temp_sasa_lig = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_sasa_lig.xvg")

        # Makes the index file with the ligand (chain z) and the rest (non chain z)
        script_make_ndx = SparkFiles.get("make_ndx_buried_area_total.sh")  # Getting bash script that was copied by addFile command
        command = script_make_ndx + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx
        process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()

        command = gromacs_path.value + "gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface System " + " -output System " + " -xvg none " + " -o " + f_temp_sasa_complex
        process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()

        # Makes f_temp_sasa_rec file
        script_make_sasa_rec = SparkFiles.get("make_sasa_rec_buried_area_total.sh")  # Getting bash script that was copied by addFile command
        command = script_make_sasa_rec + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx + " " + f_temp_sasa_rec
        process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()

        command = gromacs_path.value + "gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface chZ " + " -output chZ " + " -xvg none " + " -o " + f_temp_sasa_lig
        process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()

        sasa_complex = get_value_from_xvg_sasa(f_temp_sasa_complex)
        sasa_rec = get_value_from_xvg_sasa(f_temp_sasa_rec)
        sasa_lig = get_value_from_xvg_sasa(f_temp_sasa_lig)
        buried_total = sasa_rec + sasa_lig - sasa_complex

        # Generating result - see column sorting, because the result file will be created based on this sorting
        returned_list = (base_name, buried_total)
    except:
        returned_list = (base_name, float(0))

    # Deleting files
    if os.path.exists(f_ndx):
        os.remove(f_ndx)
    if os.path.exists(f_temp_sasa_complex):
        os.remove(f_temp_sasa_complex)
    if os.path.exists(f_temp_sasa_rec):
        os.remove(f_temp_sasa_rec)
    if os.path.exists(f_temp_sasa_lig):
        os.remove(f_temp_sasa_lig)

    return returned_list
BLACKLISTED_IP_FILENAME = 'greensnow.txt'
THREAT_DB_PATH = os.path.join(BASE_PATH, '../threatdb')
DUMPS_PATH = os.path.join(BASE_PATH, '../dumps')

# SPARK CONFIG
#SPARK = SparkSession.builder.master(SPARK_MASTER_URL).appName(SPARK_APP_NAME).config('spark.driver.memory','10g').config('spark.executor.memory','10g').config('spark.memory.fraction','0.6').config('spark.executor.extraJavaOptions', '-XX:+UseG1GC').config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer').getOrCreate()
SPARK = SparkSession.builder.master(
    SPARK_MASTER_URL
).appName(
    SPARK_APP_NAME
).getOrCreate()

COUNTRY_DB_FILEPATH = os.path.join(BASE_PATH, IP_DB_FILENAME)
SPARK.sparkContext.addFile(COUNTRY_DB_FILEPATH)
COUNTRY_DB_FILEPATH = SparkFiles.get(IP_DB_FILENAME)

# INIT DIRECTORIES
create_directory(DUMPS_PATH)
create_directory(TRAFFIC_LOGS_INPUT_DIR)
# create_directory(TRAFFIC_LOGS_OUTPUT_DIR)
create_directory(TENANT_PROFILE_OUTPUT_DIR)
create_directory(TENANT_MODEL_OUTPUT_DIR)
create_directory(ANOMALY_LOGS_OUTPUT_DIR)
create_directory(LOG_PATH)
create_directory(GRANULARIZED_LOG_PATH)
def crfprocess(sc, input, output,
               # specify uriClass=None to mean default it from inputType
               # specify uriClass=False to mean there is no uriClass filtering
               # specify uriClass=class name (e.g., 'Offer') to fully specify it
               uriClass=None,
               featureListFilename=configPath('features.hair-eye'),
               modelFilename=configPath('dig-hair-eye-train.model'),
               jaccardSpecs=[],
               imageTrainingOutput=False,
               # minimum initial number of partitions
               numPartitions=None,
               # number of documents to send to CRF in one call
               chunksPerPartition=100,
               # coalesce/down partition to this number after CRF
               coalescePartitions=None,
               # inputType
               inputType=DIG_WEBPAGE,
               outputFormat="text",
               limit=None,
               sampleSeed=1234,
               debug=0,
               location='hdfs'):

    show = True if debug >= 1 else False

    def showPartitioning(rdd):
        """Seems to be significantly more expensive on cluster than locally"""
        if show:
            partitionCount = rdd.getNumPartitions()
            try:
                valueCount = rdd.countApprox(1000, confidence=0.50)
            except:
                valueCount = -1
            print "At %s, there are %d partitions with on average %s values" % (rdd.name(), partitionCount, int(valueCount / float(partitionCount)))
            if valueCount == 0:
                showSizeAndExit(rdd)

    debugOutput = output + '_debug'

    def debugDump(rdd, keys=True, listElements=False):
        showPartitioning(rdd)
        keys = False
        if debug >= 2:
            startTime = time.time()
            outdir = os.path.join(debugOutput, rdd.name() or "anonymous-%d" % randint(10000, 99999))
            keyCount = None
            try:
                keyCount = rdd.keys().count() if keys else None
            except:
                pass
            rowCount = None
            try:
                rowCount = rdd.count()
            except:
                pass
            elementCount = None
            try:
                elementCount = rdd.mapValues(lambda x: len(x) if isinstance(x, (list, tuple)) else 0).values().sum() if listElements else None
            except:
                pass
            rdd.saveAsTextFile(outdir)
            endTime = time.time()
            elapsedTime = endTime - startTime
            print "wrote [%s] to outdir %r: [%s, %s, %s]" % (str(timedelta(seconds=elapsedTime)), outdir, keyCount, rowCount, elementCount)

    def showSizeAndExit(rdd):
        try:
            k = rdd.count()
        except:
            k = None
        print "Just finished %s with size %s" % (rdd.name(), k)
        exit(0)

    crfFeatureListFilename = featureListFilename
    crfModelFilename = modelFilename
    crfExecutable = binPath("crf_test_filter.sh")
    crfExecutable = binPath("crf_test_filter_lines.sh")
    crfExecutable = "apply_crf_lines.py"
    sc.addFile(crfExecutable)
    sc.addFile(crfModelFilename)

    # LOADING DATA
    if numPartitions:
        rdd_ingest = sc.sequenceFile(input, minSplits=numPartitions)
    else:
        rdd_ingest = sc.sequenceFile(input)
    rdd_ingest.setName('rdd_ingest_input')
    showPartitioning(rdd_ingest)

    # LIMIT/SAMPLE (OPTIONAL)
    if limit == 0:
        limit = None
    if limit:
        # Because take/takeSample collects back to master, can create "task too large" condition
        # rdd_ingest = sc.parallelize(rdd_ingest.take(limit))
        # Instead, generate approximately 'limit' rows
        ratio = float(limit) / rdd_ingest.count()
        rdd_ingest = rdd_ingest.sample(False, ratio, seed=sampleSeed)

    # FILTER TO KNOWN INTERESTING URLS (DEBUG, OPTIONAL)
    # For debugging, allow inclusion/exclusion of items with known behavior
    # If set, only those URIs so listed are used, everything else is rejected
    keepUris = []
    # contains both hair and eyes
    # keepUris.append('http://dig.isi.edu/ht/data/page/2384EBCB1DD4FCA505DD05AB15F386547D05B295/1429603739000/processed')
    # contains both hair and eyes
    # keepUris.append('http://dig.isi.edu/ht/data/page/18507EEC7DD0A94A3A00F46D8B976CDFDD258723/1429603859000/processed')
    # contains both hair and eyes
    # keepUris.append('http://dig.isi.edu/ht/data/page/442EA3A8B9FF69D65BC8B0D205C8C85204A7C799/1433150174000/processed')
    # for testing 'curly hair'
    # keepUris.append('http://dig.isi.edu/ht/data/page/681A3E68456987B1EE11616280DC1DBBA5A6B754/1429606198000/processed')
    if keepUris:
        rdd_ingest = rdd_ingest.filter(lambda (k, v): k in keepUris)

    # layout: pageUri -> content serialized JSON string
    rdd_ingest.setName('rdd_ingest_net')
    debugDump(rdd_ingest)

    # layout: pageUri -> dict (from json)
    rdd_json = rdd_ingest.mapValues(lambda x: json.loads(x))
    rdd_json.setName('rdd_json')
    debugDump(rdd_json)

    # RETAIN ONLY THOSE MATCHING URI CLASS
    if uriClass == None:
        if inputType == DIG_WEBPAGE:
            uriClass = 'WebPage'
        elif inputType == DIG_OFFER:
            uriClass = 'Offer'
        else:
            raise ValueError('Unknown inputType')

    if uriClass:
        # print("Filtering on uriClass={}".format(uriClass))
        rdd_relevant = rdd_json.filter(lambda (k, j): j.get("a", None) == uriClass)
    else:
        rdd_relevant = rdd_json
    rdd_relevant.setName('rdd_relevant')
    debugDump(rdd_relevant)

    def getIdentifiers(v):
        if inputType == DIG_OFFER:
            # We are looking for a toplevel attribute 'identifier'
            identifiers = v.get("identifier", None)
        elif inputType == DIG_WEBPAGE:
            # We are looking for the name field of the top level 'identifier' whose
            # "hasType" is "http://dig.isi.edu/thesaurus/identifier/ad-id"
            identifiers = []
            for idObj in v.get("identifier", []):
                if idObj.get("hasType") == "http://dig.isi.edu/thesaurus/identifier/ad-id":
                    name = idObj.get("name", None)
                    if name:
                        identifiers.extend(name)
        if identifiers:
            return asList(identifiers)
        else:
            return None

    def byIdentifier(k, v):
        result = []
        identifiers = getIdentifiers(v) or ["missing"]
        for i in identifiers:
            result.append((k + "-" + str(i), v))
        return result

    # Add identifier to URI
    if imageTrainingOutput:
        rdd_altered = rdd_relevant.flatMap(lambda (uri, j): byIdentifier(uri, j))
    else:
        rdd_altered = rdd_relevant
    rdd_altered.setName('rdd_altered')
    debugDump(rdd_altered)

    # print "### Processing %d input pages, initially into %s partitions" % (rdd_partitioned.count(), rdd_partitioned.getNumPartitions())

    # layout: pageUri -> (body tokens, title tokens)
    rdd_texts = rdd_altered.mapValues(lambda x: (textTokens(extract_body(x, inputType=inputType)), textTokens(extract_title(x, inputType=inputType))))
    rdd_texts.setName('rdd_texts')
    debugDump(rdd_texts)

    # We use the following encoding for values for CRF++'s so-called
    # labels to reduce the data size and complexity of referring to
    # words.  Each word is assigned a URI constructed from the page
    # URI (Karma URI) plus a 5 digit zero-padded number for the
    # subdocument plus a 5 digit zero-padded number for the word
    # index (1-based).  By "subdocument" we mean the page body and
    # page title (for HT; there could be others in other domains).
    # Additionally, an artificial "separator" document is used to
    # generate a barrier to avoid inadvertent capture of spurious
    # spans between subdocuments.
    #
    # Example: the first word of the body of
    # http://dig.isi.edu/ht/data/page/0434CB3BDFE3839D6CAC6DBE0EBED0278D3634D8/1433149274000/processed
    # would be http://dig.isi.edu/ht/data/page/0434CB3BDFE3839D6CAC6DBE0EBED0278D3634D8/1433149274000/processed/00000/00001

    SEPARATOR = ' '
    BODY_SUBDOCUMENT = 0
    SEPARATOR_SUBDOCUMENT = 1
    TITLE_SUBDOCUMENT = 2
    c = crf_features.CrfFeatures(crfFeatureListFilename)

    def makeMatrix(c, uri, bodyTokens, titleTokens):
        b = c.computeFeatMatrix(bodyTokens, False, addLabels=False, addIndex=False)
        s = c.computeFeatMatrix([SEPARATOR, ""], False, addLabels=False, addIndex=False)
        t = c.computeFeatMatrix(titleTokens, False, addLabels=False, addIndex=False)
        # BODY
        idx = 1
        for row in b:
            if row == u"":
                pass
            else:
                label = uri + "/%05d/%05d" % (BODY_SUBDOCUMENT, idx)
                row.append(label)
                idx += 1
        # SEPARATOR pseudo document
        idx = 1
        for row in s:
            if row == u"":
                pass
            else:
                label = uri + "/%05d/%05d" % (SEPARATOR_SUBDOCUMENT, idx)
                row.append(label)
                idx += 1
        # TITLE
        idx = 1
        for row in t:
            if row == u"":
                pass
            else:
                label = uri + "/%05d/%05d" % (TITLE_SUBDOCUMENT, idx)
                row.append(label)
                idx += 1
        # Keep the empty string semaphore from the title (last
        # component) for CRF++ purposes
        return b[0:-1] + s[0:-1] + t

    # page feature matrix including body, separator, title
    # (vector of vectors, includes separator rows)
    # layout: pageUri -> (python) vector of vectors
    rdd_features = rdd_texts.map(lambda (k, v): (k, makeMatrix(c, k, v[0], v[1])))
    rdd_features.setName('rdd_features')
    debugDump(rdd_features)

    # unicode UTF-8 representation of the feature matrix
    # layout: pageUri -> unicode UTF-8 representation of the feature matrix
    rdd_vector = rdd_features.mapValues(lambda x: vectorToUTF8(x))
    rdd_vector.setName('rdd_vector')
    debugDump(rdd_vector)

    # Disregard keys/partitioning considerations here
    # Drop keys, put serialized vectors into lists of size chunksPerPartition, dropping any nulls, then concatenate
    # layout: lists of size up to chunksPerPartition of UTF8(feature vectors)
    rdd_chunked = rdd_vector.values().glom().map(lambda l: [filter(lambda e: e, x) for x in iterChunks(l, chunksPerPartition)]).map(lambda l: ["".join(x) for x in l])
    rdd_chunked.setName('rdd_chunked')
    debugDump(rdd_chunked, keys=False)

    rdd_pipeinput = rdd_chunked.flatMap(lambda x: x).map(lambda r: b64encode(r))
    rdd_pipeinput.setName('rdd_pipeinput')
    debugDump(rdd_pipeinput, keys=False)

    # base64 encoded result of running crf_test and filtering to
    # include only word, wordUri, non-null label
    # local
    executable = SparkFiles.get(os.path.basename(crfExecutable)) if location == "local" else os.path.basename(crfExecutable)
    # local
    model = SparkFiles.get(os.path.basename(crfModelFilename)) if location == "local" else os.path.basename(crfModelFilename)
    cmd = "%s %s" % (executable, model)
    print "### Pipe cmd is %r" % cmd
    rdd_pipeoutput = rdd_pipeinput.pipe(cmd)
    if coalescePartitions:
        rdd_pipeoutput = rdd_pipeoutput.coalesce(max(2, coalescePartitions))
    rdd_pipeoutput.setName('rdd_pipeoutput')
    debugDump(rdd_pipeoutput)

    # base64 decoded to regular serialized string
    # beware newlines corresponding to empty CRF++ crf_test output
    # There may be a need to utf8 decode this data upon reacquisition, but believed not
    rdd_base64decode = rdd_pipeoutput.map(lambda x: b64decode(x))
    rdd_base64decode.setName('rdd_base64decode')
    debugDump(rdd_base64decode)

    def reorg(tabsep):
        (word, uri, label) = tabsep.split('\t')
        return (uri, (word, label))

    # 1. break into physical lines
    # 2. drop any inter-document empty string markers
    # 3. destructure each line into its own word, wordUri, label row
    # wordUri -> (word, label)
    rdd_tabular = rdd_base64decode.map(lambda b: b.split('\n')).flatMap(lambda x: x).filter(lambda x: x).map(lambda l: reorg(l))
    rdd_tabular.setName('rdd_tabular')
    debugDump(rdd_tabular)

    def organizeByOrigDoc(uri, word, label):
        (parentUri, docId, wordId) = uri.rsplit('/', 2)
        return ((parentUri, docId), (wordId, word, label))

    # composite key (docUri, subdocId) -> (wordId, word, label)
    rdd_reorg = rdd_tabular.map(lambda (uri, tpl): organizeByOrigDoc(uri, tpl[0], tpl[1]))
    rdd_reorg.setName('rdd_reorg')
    debugDump(rdd_reorg)

    def seqFunc(s, c):
        s.add(c)
        return s

    def combFunc(s1, s2):
        s1.update(s2)
        return s1

    # composite key (docUri, subdocId) -> set of (wordId, word, label)
    rdd_agg = rdd_reorg.aggregateByKey(set(), lambda s, c: seqFunc(s, c), lambda s1, s2: combFunc(s1, s2))
    rdd_agg.setName('rdd_agg')
    debugDump(rdd_agg)

    # (docUri, subDocId) -> sorted list of (wordId, word, label)
    rdd_grouped = rdd_agg.mapValues(lambda s: sorted(s))
    rdd_grouped.setName('rdd_grouped')
    debugDump(rdd_grouped)

    def harvest(seq):
        allSpans = []
        lastIndex = -2
        lastLabel = None
        currentSpan = []
        for (wordId, word, label) in seq:
            currentIndex = int(wordId)
            if lastIndex + 1 == currentIndex and lastLabel == label:
                # continuing current span
                currentSpan.append((currentIndex, word, label))
            else:
                # end current span
                if currentSpan:
                    allSpans.append(currentSpan)
                # begin new span
                currentSpan = [(currentIndex, word, label)]
            lastLabel = label
            lastIndex = currentIndex

        # end current span
        if currentSpan:
            allSpans.append(currentSpan)

        result = []
        for span in allSpans:
            words = []
            spanLabel = None
            for (wordIdx, word, label) in span:
                spanLabel = label
                words.append(word)
            result.append((' '.join(words), spanLabel))
        return result

    # ((parentUri, docId), [(words1, category1), (words2, category2), ...])
    rdd_harvest = rdd_grouped.mapValues(lambda s: harvest(s))
    rdd_harvest.setName('rdd_harvest')
    debugDump(rdd_harvest)

    # parentUri -> (words, category)
    # we use .distinct() because (e.g.) both title and body might mention the same feature
    rdd_flat = rdd_harvest.map(lambda r: (r[0][0], r[1])).flatMapValues(lambda x: x).distinct()
    rdd_flat.setName('rdd_flat')
    debugDump(rdd_flat)

    # We map from CRF output (category) to (potentially multiple) HJ handle(s)
    hjHandlers = defaultdict(list)
    for (category, digFeature, config, reference) in jaccardSpecs:
        # add one handler
        if config and reference:
            hjHandlers[category].append({"category": category,
                                         "featureName": digFeature,
                                         "hybridJaccardInterpreter": HybridJaccard(config_path=config, ref_path=reference).findBestMatch})
        else:
            hjHandlers[category].append({"category": category,
                                         "featureName": digFeature,
                                         "hybridJaccardInterpreter": None})

    def jaccard(tpl):
        results = []
        (words, category) = tpl
        for handler in hjHandlers[category]:
            interpreter = handler["hybridJaccardInterpreter"]
            results.append({"featureName": handler["featureName"],
                            "featureValue": interpreter(words) if interpreter else words,
                            # for debugging
                            "crfCategory": category,
                            "crfWordSpan": words,
                            # intended to support parametrization/provenance
                            "featureDefinitionFile": os.path.basename(crfFeatureListFilename),
                            "featureCrfModelFile": os.path.basename(crfModelFilename)})
        return results

    # there could be more than one interpretation, e.g.
    # hairColor + hairType for a given CRF category
    # use flatMapValues to iterate over all
    rdd_jaccard = rdd_flat.flatMapValues(lambda v: jaccard(v))
    rdd_jaccard.setName('rdd_jaccard')
    debugDump(rdd_jaccard)

    def extendDict(d, key, value):
        d[key] = value
        return d

    # add in the URI for karma modeling purposes
    rdd_aligned = rdd_jaccard.map(lambda (uri, v): (uri, extendDict(v, "uri", uri)))
    rdd_aligned.setName('rdd_aligned')
    debugDump(rdd_aligned)

    # rdd_aligned = rdd_pipeinput

    # docUri -> json
    def recoverIdentifier(k):
        return k.rsplit('-', 1)[-1]

    if imageTrainingOutput:
        rdd_final = rdd_aligned.map(lambda (k, v): (recoverIdentifier(k), (v.get("featureName"), v.get("featureValue")))).filter(lambda (k, p): p[1] != 'NONE')
    else:
        rdd_final = rdd_aligned.mapValues(lambda v: json.dumps(v))
    rdd_final.setName('rdd_final')
    debugDump(rdd_final)

    if rdd_final.isEmpty():
        print "### NO DATA TO WRITE"
    else:
        if outputFormat == "sequence":
            rdd_final.saveAsSequenceFile(output)
        elif outputFormat == "text":
            rdd_final.saveAsTextFile(output)
        elif outputFormat == "tsv":
            rdd_tsv = rdd_final.map(lambda (k, p): k + "\t" + p[0] + "\t" + p[1])
            rdd_tsv.saveAsTextFile(output)
        else:
            raise RuntimeError("Unrecognized output format: %s" % outputFormat)
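# Hedged sketch of the limit-by-sampling trick crfprocess uses above (names
# are illustrative): take(n) collects rows to the driver and re-parallelizes
# them, which can trigger "task too large", whereas sample() keeps the cut
# distributed and yields approximately `limit` rows.
from pyspark import SparkContext

sc = SparkContext(appName="limit-by-sampling")
rdd = sc.parallelize(xrange(100000))
limit = 500
ratio = float(limit) / rdd.count()
# Approximately `limit` rows, never materialized on the driver.
approx_limited = rdd.sample(False, ratio, seed=1234)
print approx_limited.count()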
def upload_file_job(context):
    from pyspark import SparkFiles
    with open(SparkFiles.get(upload_file_name)) as testFile:
        file_val = testFile.readline()
    return file_val
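# Hedged driver-side sketch for upload_file_job (file name and path are
# assumptions): the file is shipped with addFile, after which the job can
# open the local copy on whatever node runs it.
from pyspark import SparkContext

upload_file_name = "hello.txt"
sc = SparkContext("local[*]", "upload-file-job")
sc.addFile("/tmp/" + upload_file_name)  # assumed driver-local path
print upload_file_job(None)  # the context argument is unused by the job body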
def sparkFilePathMapper(self, path):
    """When Spark forwards files from the driver to worker nodes,
    it may be necessary to map the filename path on a per-worker
    node basis."""
    # Note the implication in this code that the feature list file and
    # model file must have unique basenames.
    return SparkFiles.get(os.path.basename(path))
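# Hedged illustration of the unique-basename caveat noted above (paths are
# assumptions, and this only runs inside an active Spark application):
# SparkFiles keys shipped files by basename only, so two driver paths with
# the same basename would collide in the per-worker scratch directory.
import os
from pyspark import SparkFiles

driver_paths = ["/configs/crf/features.hair-eye",
                "/models/dig-hair-eye-train.model"]
worker_paths = [SparkFiles.get(os.path.basename(p)) for p in driver_paths]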
def update_data(self):
    self.arrays['data'][:] = np.random.randn(*self.arrays['data'].shape)
    self.arrays['label'][:] = np.random.choice(
        xrange(10), size=self.arrays['label'].shape)

def process_model(self):
    pass

# Create some dummy data
dataRDD = sc.parallelize(xrange(100))

# Create some barista instances
num_baristas = 2
start_script = 'python -m barista.start'
solver = SparkFiles.get("solver.prototxt")
interfaces = sc.parallelize([solver]*num_baristas, num_baristas) \
               .pipe(start_script) \
               .collect()

# Join the data
def train(interface, data):
    solver_filename, pid = interface.split(',')
    customer = MyCustomer(solver_filename)
    customer.run_transaction()
    grad_norm = np.linalg.norm(customer.arrays['conv1_dW'])
    return grad_norm

grad_norms = dataRDD.map(lambda x: train(interfaces[0], x)).collect()
print grad_norms
def find_neighbors(i):
    from annoy import AnnoyIndex
    ai = AnnoyIndex(f)
    ai.load(SparkFiles.get("index.ann"))
    return (ai.get_nns_by_vector(vector=x[1], n=5) for x in i)
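# Hedged usage sketch for find_neighbors: f is the vector dimensionality and
# must match the prebuilt index; the index path and the (id, vector) data are
# assumptions. The index file is shipped once and memory-mapped per partition.
from pyspark import SparkContext

f = 100
sc = SparkContext(appName="annoy-neighbors")
sc.addFile("/models/index.ann")  # assumed path to a prebuilt Annoy index
pair_rdd = sc.parallelize([(0, [0.0] * f), (1, [1.0] * f)])
print pair_rdd.mapPartitions(find_neighbors).collect()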
rdd_json = rdd.mapValues(lambda x: json.loads(x))

rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))

# TBD
# rdd_title = rdd_json.mapValues(lambda x: extract_title(x))
# rdd_title_tokens = rdd_title.mapValues(lambda x: textTokens(x))
# all below should also be done for title

# not a pair RDD?
rdd_features = rdd_body_tokens.map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToString(x))

cmd = SparkFiles.get("crf_test") + " -m " + SparkFiles.get(crfModelFilename)
rdd_crf = rdd_pipeinput.values().pipe(cmd)

# not a pair RDD
# but we have the URI in the -3 position
# and the index in the -2 position
rdd_withuri = rdd_crf.map(lambda x: reconstructTuple(x))

rdd_grouped = rdd_withuri.groupByKey()
rdd_flat = rdd_grouped.mapValues(lambda x: [l[1:] for l in sorted(x, key=lambda r: int(r[0]))])
rdd_harvested = rdd_flat.mapValues(lambda x: computeSpans(x, indexed=True))

# This has the effect of generating 0, 1, 2, ... lines according to the number of spans
rdd_controlled = rdd_harvested.flatMapValues(lambda x: list(x))

# map any eyeColor spans using smEye, hairType spans using smHair
rdd_aligned = rdd_controlled.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEye, "hairType": smHair}))
import dash_html_components as html
from dash.dependencies import Input, Output

external_stylesheets = ["https://codepen.io/chriddyp/pen/bWLwgP.css"]
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
#server = app.server

plt_io.templates["custom_dark"] = plt_io.templates["plotly_dark"]

spark = SparkSession.builder.master('local[*]').appName('ICULux').config("spark.files.overwrite", "true")\
    .config("spark.worker.cleanup.enabled", "true").getOrCreate()
sc = spark.sparkContext

url = "https://physionet.org/files/mimicdb/1.0.0/055/05500001.txt"
sc.addFile(url)

# first get all lines from file
with open(SparkFiles.get("05500001.txt"), 'r') as f:
    lines = f.readlines()

# remove spaces
lines = [line.replace(' ', '') for line in lines]

# finally, write lines in the file
#with open("temp.txt", 'w') as f:
#    f.writelines(lines)

schema = StructType([
    StructField("Name", StringType(), True),
    StructField("val1", StringType(), True),
    StructField("val2", StringType(), True),
    StructField("val3", StringType(), True)
])
sqlContext = SQLContext(sc)

# add the file to the spark context
# from pyspark.sql import SQLContext
url = "201029COVID19MEXICO.csv"
from pyspark import SparkFiles
sc.addFile(url)
sqlContext = SQLContext(sc)

# In[14]:

# read the csv and create a dataframe
df = sqlContext.read.csv(SparkFiles.get("201029COVID19MEXICO.csv"), header=True, inferSchema=True)
df.printSchema()

# group and print the count of type of patient
# 1 = outpatient, 2 = inpatient
df.groupBy("TIPO_PACIENTE").count().sort("count", ascending=True).show()

# In[15]:

import pyspark.sql.functions as F

#df1 = df.withColumnRenamed("TIPO_PACIENTE","Patient_Type")\
#        .withColumnRenamed("SEXO","Sex")
def __init__(self):
    """Create a spark context and session"""
    print('__init__ called')
    config = configparser.ConfigParser()
    print(
        '---------------------------------------------------Start-----------------------------------------------------------------------------------------------------'
    )
    sc = SparkContext.getOrCreate()
    print(sc.applicationId)
    print(sc.master)
    print('spark conf starts')
    spark = SparkSession(sc)
    for item in spark.sparkContext.getConf().getAll():
        print(item)
    print('spark conf ends')
    # cwd = os.getcwd()
    # arr = os.listdir(cwd)
    # print('-------------------------------------------------------working directories-----------------------------------------------------------------------------------')
    # print(cwd)
    # print(arr)
    # print('----------------------------------------------------------cmd starts-----------------------------------------------------------------------------------------')
    ## cmd = 'hdfs dfs -ls /home/hadoop'.split()  # cmd must be an array of arguments
    ## files = subprocess.check_output(cmd).strip().split('\n')
    ## for path in files:
    ##     print (path)
    # print('-------------------------------applicationId-------------------------------------------------------------------------------------------------------------------')
    # appId = sc.applicationId
    # print(appId)
    # ip = 'ip-172-31-18-164.ec2.internal'
    # appPath = 'hdfs://' + ip + ':8020/user/hadoop/.sparkStaging/' + appId + '/config.json'
    # iniPath = 'hdfs://' + ip + ':8020/user/hadoop/.sparkStaging/' + appId + '/config.ini'
    # print(appPath)
    # print(iniPath)
    # tmp1 = 'file:///user/hadoop/.sparkStaging/' + appId + '/config.ini'
    # print(tmp1)
    ## print('-------------------------------------------------------------config reader starts-------------------------------------------------------------------------')
    ## strng = open(appPath, 'r').read()
    ## print(strng)
    # print('----------------------------------open ends---------------------------------------------------------------------------------')
    # print(SparkFiles.getRootDirectory())
    # print(os.listdir(SparkFiles.getRootDirectory()))
    ## cmd = ('hdfs dfs -ls ' + SparkFiles.getRootDirectory()).split()
    ## files = subprocess.check_output('hdfs dfs -ls ' + SparkFiles.getRootDirectory()).strip().split('\n')
    ## for path in files:
    ##     print (path)
    ## stg_path = str(fs.defaultFS) + "/user/" + str(os.environ['USER']) + "/.sparkStaging/" + str(sc.applicationId) + "/"
    ## lines = sc.textFile(os.path.join(stg_path, 'readme.txt'))  # commented out: stg_path is only defined in the commented experiment above
    ## print(lines.collect())
    # print('--------------------------------------------------getRootDirectory----------------------------------------------------------------------')
    ## configFile = pkg_resources.resource_filename(pkg_resources.Requirement.parse("myapp"), "config.ini")
    ## config = ConfigParser.ConfigParser()
    ## config.read(configFile)
    # print('------------------INI file--------------------------------------------------------------------------------------------------------------------------')
    # conString = ''
    # inputFile = 'config.ini'
    # with open(SparkFiles.get(inputFile)) as test_file:
    #     conString = test_file.read()
    # print('----------------------------------------------------------------config reader ends---------------------------------------------------------------------')
    # print(conString)
    # print('print(conString) starts')
    # config.read_string(conString.decode())
    # print('print(conString) ends')
    # print(config)
    # val = config.get('SPARK', 'val')
    ## val = config['SPARK']['val']
    # print(val)
    # print('----------------------------------------------------------val-----------------------------------------------------------------------------------------------')
    print(
        '------------------JSON file--------------------------------------------------------------------------------------------------------------------------'
    )
    inputFile = 'config.json'
    # print(os.environ)
    # print(os.environ['SPARK_YARN_STAGING_DIR'])
    # print('os.environ completed')
    print(SparkFiles.getRootDirectory())
    ipath = os.path.join(SparkFiles.getRootDirectory(), inputFile)
    print(ipath)
    conString = ''
    print(
        '-----------------------------------------------------------printing a------------------------------------------------------------'
    )
    a = sc.textFile("file:///" + SparkFiles.get(inputFile)).collect()
    print(a)
    print(
        '-----------------------------------------------------------printing b------------------------------------------------------------'
    )
    b = sc.textFile("file:///" + ipath).collect()
    print(b)
    # with open(SparkFiles.get(inputFile)) as test_file:
    #     conString = test_file.read()
    print(
        '----------------------------------------------------------------config reader ends---------------------------------------------------------------------'
    )
    print(conString)
    print(
        '----------------------------------------------------------val-----------------------------------------------------------------------------------------------'
    )
    # hdfs://ip-172-31-19-25.ec2.internal:8020
    # print('---------------------------------------------Hadoop Files------------------------------------------------------------------------------------------------------')
    ## print(SparkFiles.getRootDirectory())
    # print('------------------------------------------------------------Context------------------------------------------------------------------------------------------------')
    ## textFile = sc.textFile(appPath)
    # conf = spark.read.option("multiline", True).json(appPath)
    # print('------------------------------------------------------------------------Spark Read---------------------------------------------------------------------------')
    # print(conf.printSchema())
    # print(conf.select('SPARK1').first()[0])
    # print(sc.sparkUser())
    # print('---------------------------------------------------------completed---------------------------------------------------------------------------------------')
    # data = sc.parallelize(list(conf['SPARK']))
    data = sc.parallelize(list('HelloWorld12345'))
    data.map(lambda x: (x, 1)).reduceByKey(add).sortBy(
        lambda x: x[1], ascending=False).coalesce(1).saveAsTextFile(
            'tmp/result/' + str(ts))  # s3://nithin-emr/' + str(ts) + '/result'
    sc.stop()
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7" # Start a SparkSession import findspark findspark.init() from pyspark.sql import SparkSession spark = SparkSession.builder.appName( "FakeNewsPoject_Naive_Bayes").getOrCreate() from pyspark import SparkFiles # Load in Fake.csv from S3 into a DataFrame fake_url = "https://bootcamp-proj-3.s3.us-east-2.amazonaws.com/Fake.csv" spark.sparkContext.addFile(fake_url) raw_fake_df = spark.read.csv(SparkFiles.get("Fake.csv"), sep=",", header=True) # raw_fake_df.show(10) # Load in True.csv from S3 into a DataFrame true_url = "https://bootcamp-proj-3.s3.us-east-2.amazonaws.com/True.csv" spark.sparkContext.addFile(true_url) raw_true_df = spark.read.csv(SparkFiles.get("True.csv"), sep=",", header=True) # raw_true_df.show(10) import pyspark.sql.functions as sf # Add true/fake categories add_category_fake = raw_fake_df.withColumn('category', sf.lit('Fake')) add_category_true = raw_true_df.withColumn('category', sf.lit('True'))
def apply_network(dataset: str, serialized):
    hyperparams_route = SparkFiles.get(f'{dataset}_hyper.pkl')
    model_route = SparkFiles.get(f'{dataset}_model.pth')
    predictor = Predictor(hyperparams_route, model_route)
    return predictor.predict(serialized)
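# Hedged driver-side counterpart to apply_network (paths and dataset names
# are assumptions): each dataset's hyperparameter pickle and model weights
# are shipped once, then resolved by basename inside the executors.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
for dataset in ("mnist", "cifar"):  # hypothetical dataset names
    sc.addFile(f"/models/{dataset}_hyper.pkl")
    sc.addFile(f"/models/{dataset}_model.pth")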
def crfprep(sc, inputFilename, outputDirectory,
            limit=LIMIT, location='hdfs', outputFormat="text", partitions=None):
    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    featureListFilename = os.path.join(crfConfigDir, "features.hair-eye")
    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    crfExecutable = "/usr/local/bin/crf_test_filter.sh"
    crfModelFilename = os.path.join(crfConfigDir, "dig-hair-eye-train.model")

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    if limit:
        rdd_sequence_file_input = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)

    rdd_json = rdd_sequence_file_input.mapValues(lambda x: json.loads(x))
    rdd_json.setName('rdd_json')
    # rdd_json.persist()

    rdd_texts = rdd_json.mapValues(lambda x: (textTokens(extract_body(x)), textTokens(extract_title(x))))
    rdd_texts.setName('rdd_texts')
    # data format issue?
    # rdd_texts.saveAsSequenceFile(outputDirectory + "_texts")

    # This separator could have appeared in original text, and should serve to cleanly delimit the body from the title
    # Not perfect, it could have appeared between real tokens

    # Needs to have single labels+index feature
    # former code was lost
    c = crf_features.CrfFeatures(featureListFilename)
    SEPARATOR = ' '

    def makeMatrix(c, uri, bodyTokens, titleTokens):
        b = c.computeFeatMatrix(bodyTokens, False, addLabels=False, addIndex=False)
        s = c.computeFeatMatrix([SEPARATOR, ""], False, addLabels=False, addIndex=False)
        t = c.computeFeatMatrix(titleTokens, False, addLabels=False, addIndex=False)
        idx = 1
        for row in b:
            if row == u"":
                pass
            else:
                label = uri + "/%05d/%05d" % (0, idx)
                row.append(label)
                idx += 1
        idx = 1
        for row in s:
            if row == u"":
                pass
            else:
                label = uri + "/%05d/%05d" % (1, idx)
                row.append(label)
                idx += 1
        idx = 1
        for row in t:
            if row == u"":
                pass
            else:
                label = uri + "/%05d/%05d" % (2, idx)
                row.append(label)
                idx += 1
        # might be b[0:-1] + s[0:-1] + t?
        return b[0:-1] + s[0:-1] + t

    rdd_features = rdd_texts.map(lambda x: (x[0], makeMatrix(c, x[0], x[1][0], x[1][1])))
    rdd_features.setName('rdd_features')
    # rdd_features.persist()

    rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToUTF8(x)).values()
    rdd_pipeinput.setName('rdd_pipeinput')

    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfExecutable), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfExecutable)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "###CMD %s" % cmd
    rdd_crfoutput = rdd_pipeinput.pipe(cmd)
    rdd_crfoutput.setName('rdd_crfoutput')
    # rdd_crfoutput.persist()

    rdd_final = rdd_crfoutput
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
from pyspark.sql import Row
from pyspark import SparkFiles
from pyspark.sql.types import *
from pyspark.ml import Pipeline

print('xxxxxxxxxxxxxxxxx -PIPELINE- xxxxxxxxxxxxxxx #2')

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, OneHotEncoderEstimator
from pyspark.sql.functions import col, countDistinct
import pyspark.sql.functions as f
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

print('xxxxxxxxxxxxxxxxx -STRINGIndexer- xxxxxxxxxx #3')

url = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
sc.addFile(url)
Dataf = sqlContext.read.csv(SparkFiles.get("adult_data.csv"), header=True, inferSchema=True)

print(
    'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx HEADS xxxxxxxxxxxx '
)

windowX = Window.partitionBy(Dataf['income']).orderBy(Dataf['age'].desc())
Dataf.select(
    'income', 'gender', 'workclass', 'education', 'educational-num', 'income',
    rank().over(windowX).alias('pemba')).filter(col('pemba') <= 50000).show()

print(
    'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx GROUPLIST xxxxxxxx '
)

Dataf.printSchema()
print((Dataf.count(), len(Dataf.columns)))
def compute_buried_area_all_residues_and_receptor_area(pdb_complex):
    chZ = "chZ"
    res_buried_area_perc = -1
    res_buried_area = -1
    buried_receptor_system = -1
    buried_receptor_res = -1

    base_name = get_name_model_pdb(pdb_complex)
    ligand_name = get_ligand_from_receptor_ligand_model(base_name)
    receptor_name = get_receptor_from_receptor_ligand_model(base_name)
    pose = get_model_from_receptor_ligand_model(base_name)

    # output area receptor file
    f_output_receptor_buried_area = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".outAreaRecep")

    # ndx files
    # f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")
    f_ndx_temporary_index_z = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary_index_z" + ".ndx")
    f_ndx_temporary = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary" + ".ndx")
    f_ndx_temporary_sasa = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary_sasa" + ".ndx")

    # xvg files
    f_xvg_temporary_sasa_res_lig = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary_sasa_res-lig" + ".xvg")
    f_xvg_temporary_sasa_res = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary_sasa_res" + ".xvg")
    f_xvg_temporary_sasa_rec_lig = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary_sasa_rec_lig" + ".xvg")
    f_xvg_temporary_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary_sasa_rec" + ".xvg")

    # Creates a selection with the residues that are closer than 6A to the ligand
    script_make_ndx_buried_area_receptor = SparkFiles.get("make_ndx_buried_area_receptor.sh")  # Getting bash script that was copied by addFile command
    command = script_make_ndx_buried_area_receptor + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx_temporary_index_z + " " + f_ndx_temporary
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    # copying file
    if os.path.exists(f_ndx_temporary):
        shutil.copy(f_ndx_temporary, f_ndx_temporary_sasa)

        # Get all residues for computing area receptor
        all_res = get_residues_receptor_from_ndx_files(f_ndx_temporary)

        returned_list = []
        for res in all_res:
            script_make_ndx_buried_area_receptor_res = SparkFiles.get("make_ndx_buried_area_receptor_res.sh")  # Getting bash script that was copied by addFile command
            command = script_make_ndx_buried_area_receptor_res + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx_temporary_sasa + " " + str(res)
            process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
            stdout, stderr = process.communicate()

            # compute surface of system - saved on xvg
            command = gromacs_path.value + "gmx sasa -surface complex -output rec_" + str(res) + " -o " + f_xvg_temporary_sasa_res_lig + " -xvg none -f " + pdb_complex + " -s " + pdb_complex + " -n " + f_ndx_temporary + " -nopbc "
            process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
            stdout, stderr = process.communicate()

            # compute surface of receptor - saved on xvg
            command = gromacs_path.value + "gmx sasa -surface rec -output rec_" + str(res) + " -o " + f_xvg_temporary_sasa_res + " -xvg none -f " + pdb_complex + " -s " + pdb_complex + " -n " + f_ndx_temporary + " -nopbc "
            process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
            stdout, stderr = process.communicate()

            # calculate area
            if os.path.exists(f_xvg_temporary_sasa_res_lig):
                buried_receptor_system = get_value_from_xvg_sasa(f_xvg_temporary_sasa_res_lig)
            else:
                buried_receptor_system = 0
            if os.path.exists(f_xvg_temporary_sasa_res):
                buried_receptor_res = get_value_from_xvg_sasa(f_xvg_temporary_sasa_res)
            else:
                buried_receptor_res = 0

            res_buried_area = buried_receptor_res - buried_receptor_system
            if (res_buried_area > 0) and (buried_receptor_res > 0):
                res_buried_area_perc = res_buried_area / buried_receptor_res
                # Generating result
                result = (base_name, res, res_buried_area, res_buried_area_perc)
                returned_list.append(result)

            # Deleting files
            if os.path.exists(f_xvg_temporary_sasa_res_lig):
                os.remove(f_xvg_temporary_sasa_res_lig)
            if os.path.exists(f_xvg_temporary_sasa_res):
                os.remove(f_xvg_temporary_sasa_res)

        # Computing Receptor Area
        command = gromacs_path.value + "gmx sasa -surface complex -output rec" + " -o " + f_xvg_temporary_sasa_rec_lig + " -xvg none -f " + pdb_complex + " -s " + pdb_complex + " -n " + f_ndx_temporary + " -nopbc "
        process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()

        command = gromacs_path.value + "gmx sasa -surface rec -output rec" + " -o " + f_xvg_temporary_sasa_rec + " -xvg none -f " + pdb_complex + " -s " + pdb_complex + " -n " + f_ndx_temporary + " -nopbc "
        process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()

        if os.path.exists(f_xvg_temporary_sasa_rec_lig):
            sasa_rec_lig = get_value_from_xvg_sasa(f_xvg_temporary_sasa_rec_lig)
        else:
            sasa_rec_lig = 0
        if os.path.exists(f_xvg_temporary_sasa_rec):
            sasa_rec = get_value_from_xvg_sasa(f_xvg_temporary_sasa_rec)
        else:
            sasa_rec = 0

        receptor_area = sasa_rec - sasa_rec_lig

        # Saving result file
        output_receptor_buried_area = open(f_output_receptor_buried_area, "w")
        output_receptor_buried_area.write(str(base_name) + " " + str(receptor_area) + "\n")
        output_receptor_buried_area.close()

        # Deleting all files
        if os.path.exists(f_xvg_temporary_sasa_rec_lig):
            os.remove(f_xvg_temporary_sasa_rec_lig)
        if os.path.exists(f_xvg_temporary_sasa_rec):
            os.remove(f_xvg_temporary_sasa_rec)
        if os.path.exists(f_ndx_temporary):
            os.remove(f_ndx_temporary)
        if os.path.exists(f_ndx_temporary_sasa):
            os.remove(f_ndx_temporary_sasa)
        if os.path.exists(f_ndx_temporary_index_z):
            os.remove(f_ndx_temporary_index_z)

        return returned_list
    else:
        # Some problem occurred while computing the area
        return (base_name, "NAN", float(0), float(0))
# Can I persist a Caffe network object?
import copy

from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles
from pyspark import StorageLevel

conf = SparkConf().setAppName("SparkCaffe Test")
conf.set("spark.executor.memory", "1g")
sc = SparkContext(conf=conf)

sc.addFile("models/solver.prototxt")
sc.addFile("models/train_val.prototxt")

solver = SparkFiles.get("solver.prototxt")
architecture = SparkFiles.get("train_val.prototxt")

def create_net(solver_filename):
    from caffe import SGDSolver
    net = SGDSolver(str(solver_filename)).net
    return net

netRDD = sc.parallelize([solver]*2, 2) \
           .map(create_net)
netRDD.persist(StorageLevel.MEMORY_ONLY)

def extract_unique_val(net):
    return net.params['conv1'][0].data[0, 0, 0, 0]
def main():
    conf = (SparkConf()
            .setMaster("local[*]")
            .setAppName("compare_engine"))

    sc = SparkContext(conf=conf)
    sc.setLogLevel('INFO')

    sc.addFile(primary)
    # rdd_primary = sc.textFile(primary, minPartitions=4, use_unicode=True).distinct()
    rdd_primary = sc.textFile(SparkFiles.get(primary), minPartitions=4, use_unicode=True).distinct()
    rdd_primary.partitionBy(10).cache()

    os.system('rm -Rf collects_*')
    os.system('rm -Rf holder.txt')

    rdd_secondary = sc.textFile(secondary, minPartitions=4, use_unicode=True).distinct()
    rdd_secondary.partitionBy(10).cache()

    primary_count = rdd_primary.count()
    primary_report['count'] = primary_count
    print(primary_report)

    secondary_count = rdd_secondary.count()
    secondary_report['count'] = secondary_count
    print(secondary_report)

    # Return each Primary file line/record not contained in Secondary
    not_in_primary = rdd_primary.subtract(rdd_secondary)
    primary_diff = not_in_primary.count()
    primary_report['diff'] = primary_diff

    os.system('rm -Rf collects_*.csv')

    primary_dir = 'collects_{}_primary'.format(run_date)
    primary_report_name = 'collects_{}_primary_report.csv'.format(run_date)

    not_in_primary.coalesce(1, True).saveAsTextFile(primary_dir)

    # os.system('cat collects_{}_primary/part-0000* >> collects_{}_primary_report.csv'.format(run_date, run_date))
    os.system('cat {}/part-0000* >> {}'.format(primary_dir, primary_report_name))
    os.system('wc -l collects_{}_primary_report.csv'.format(run_date))

    # Flip Primary Vs Secondary
    # Return each Secondary file line/record not contained in Primary
    not_in_secondary = rdd_secondary.subtract(rdd_primary)
    secondary_diff = not_in_secondary.count()
    secondary_report['diff'] = secondary_diff

    not_in_secondary.coalesce(1, True).saveAsTextFile('collects_{}_secondary'.format(run_date))
    os.system('cat collects_{}_secondary/part-0000* >> collects_{}_secondary_report.csv'.format(run_date, run_date))
    os.system('wc -l collects_{}_secondary_report.csv'.format(run_date))

    process_report['primary'] = primary_report
    process_report['secondary'] = secondary_report

    print("=" * 100)
    print('\n')
    print(process_report)
    print('\n')
    print("=" * 100)

    spark_details(sc)

    sc.stop()
THREDSHOLD = 0.6

sc = SparkContext.getOrCreate()
sc.addFile(
    's3://s3-cdp-prod-airflow-dag/1.10/artifacts/brandnorm/cqi_brand/python/ner/crf.model.5Feat_33018Pos_11350Neg'
)
sc.addFile(
    's3://s3-cdp-prod-airflow-dag/1.10/artifacts/brandnorm/cqi_brand/python/ner/brand_dict.txt'
)
#sc.addPyFile('crfTaggerManager.py')
spark = SparkSession(sc)

with open(SparkFiles.get('brand_dict.txt'), 'r') as infile:
    brandDictMap = json.load(infile)

from crf_title_tagger import extract_features

# load logger
from log_factory import *
logger = get_prod_logger()


def get_raw_df():
    df = spark.read.orc('s3://s3-cdp-prod-hive/temp/cqi_item_brand_field_no_extraction/')\
        .select('item_id', 'original_category_codes', 'level_one_category_codes', 'level_two_category_codes', 'original_product_name', 'original_brand')\
        .toDF('itemId', 'originalCategory', 'levelOneCategory', 'levelTwoCategory', 'originalProductName', 'originalbrand')
    return df
def driver(sc, inputFilename, outputDirectory,
           crfExecutable, crfScript,
           featureListFilename, crfModelFilename,
           eyeColorRef, eyeColorConfig, hairRef, hairConfig,
           limit=limit, location='hdfs', outputFormat="text", partitions=None):
    dump = False
    partitions = None

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except:
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    origSize = rdd_sequence_file_input.count()
    # if limit:
    #     rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (inputFilename, rdd_sequence_file_input.count(), origSize, limit, rdd_sequence_file_input.getNumPartitions())

    rdd_json = rdd_sequence_file_input.mapValues(lambda x: json.loads(x))
    rdd_json.setName('rdd_json')
    # rdd_json.persist()

    # all below should also be done for title
    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body.setName('rdd_body')
    # rdd_body.persist()
    if dump:
        rdd_body.saveAsTextFile(ff("body"))

    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))
    rdd_body_tokens.setName('rdd_body_tokens')
    # rdd_body_tokens.persist()
    if dump:
        rdd_body_tokens.saveAsTextFile(ff("body_tokens"))

    rdd_features = rdd_body_tokens.map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
    rdd_features.setName('rdd_features')
    # rdd_features.persist()
    if dump:
        rdd_features.saveAsTextFile(ff("features"))

    # rdd_pipeinput = rdd_features.mapValues(lambda x: base64.b64encode(vectorToString(x)))
    rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToString(x))
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()
    if dump:
        rdd_pipeinput.values().saveAsTextFile(ff("pi"))
    # This caused a cannot concatenate string + None error
    # rdd_pipeinput.saveAsTextFile(outputDirectory + "-pipeinput")

    # DON'T USE SparkFiles.get to fetch the crf_test or model
    # This only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    # debug: dump the pipe input and bail out early
    rdd_pipeinput.saveAsTextFile(ff("before"))
    exit(0)

    rdd_crf_b64 = rdd_pipeinput.values().pipe(cmd)
    rdd_crf_b64.setName('rdd_crf_b64')
    # rdd_crf_b64.persist()
    if dump:
        rdd_crf_b64.saveAsTextFile(ff("po"))

    # Go directly from base64 output to a reconstructed tuple format mapping URI to vector of vectors,
    # with empty string suffix indicating blank line
    # This is key for avoiding the groupBy step
    rdd_restore = rdd_crf_b64.map(lambda x: restore(x))
    rdd_restore.setName('rdd_restore')
    # rdd_restore.persist()
    if dump:
        rdd_restore.saveAsTextFile(ff("restore"))

    # ### WE NO LONGER HAVE TO GROUPBY
BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW # rdd_withuri = sc.parallelize(rdd_withuri.take(10)) rdd_harvested = rdd_restore.mapValues(lambda x: computeSpans(x, indexed=True)).filter(lambda p: p[1]) rdd_harvested.setName('rdd_harvested') # rdd_harvested.persist() if dump: rdd_harvested.saveAsTextFile(ff("harvested")) # This has the effect of generating 0, 1, 2, ... lines according to the number of spans rdd_controlled = rdd_harvested.flatMapValues(lambda x: list(x)) rdd_controlled.setName('rdd_controlled') # rdd_controlled.persist() # map any eyeColor spans using smEyeColor, hairType spans using smHairColor rdd_aligned = rdd_controlled.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor})) rdd_aligned.setName('rdd_aligned') # rdd_aligned.persist() if dump: rdd_aligned.saveAsTextFile(ff("aligned")) rdd_aligned_json = rdd_aligned.mapValues(lambda x: json.dumps(x)) rdd_aligned_json.setName('rdd_aligned_json') # rdd_aligned_json.persist() if dump: rdd_aligned_json.saveAsTextFile(ff("aligned_json")) rdd_final = rdd_aligned_json empty = rdd_final.isEmpty() if not empty: l = "unknown>1" print "### writing %s output (%s records) to %s" % (outputFormat, l, outputDirectory) # print len(rdd_final.collect()) if outputFormat == "sequence": rdd_final.saveAsSequenceFile(outputDirectory) elif outputFormat == "text": rdd_final.saveAsTextFile(outputDirectory) else: raise RuntimeError("Unrecognized output format: %s" % outputFormat) else: print "### No records: no output into %s" % (outputDirectory)
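# A distilled, hedged sketch of the ship-and-pipe pattern above, minus the
# CRF-specific plumbing. The names (tagger.sh, model.bin, the input and output
# paths) are hypothetical. As the comments above note, SparkFiles.get only
# resolves correctly for --master local[*]; on a real cluster the command
# refers to the shipped files by basename.
sc.addFile("tagger.sh")
sc.addFile("model.bin")
if location == 'local':
    cmd = "%s %s" % (SparkFiles.get("tagger.sh"), SparkFiles.get("model.bin"))
else:
    cmd = "tagger.sh model.bin"
sc.textFile("input.txt").pipe(cmd).saveAsTextFile("tagged-out")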
h2oContext = H2OContext(sc).start()

# Define file names
chicagoAllWeather = "chicagoAllWeather.csv"
chicagoCensus = "chicagoCensus.csv"
chicagoCrimes10k = "chicagoCrimes10k.csv"

# Add files to the Spark cluster
sc.addFile(_locate(chicagoAllWeather))
sc.addFile(_locate(chicagoCensus))
sc.addFile(_locate(chicagoCrimes10k))

# Since the files were distributed with sc.addFile, use h2o.upload_file rather
# than h2o.import_file: import_file expects a cluster-visible path (one that
# every machine can read), while SparkFiles.get returns a node-local path.
# upload_file handles that case: it uploads the file from the current node and
# distributes it across the H2O cluster.
f_weather = h2o.upload_file(SparkFiles.get(chicagoAllWeather))
f_census = h2o.upload_file(SparkFiles.get(chicagoCensus))
f_crimes = h2o.upload_file(SparkFiles.get(chicagoCrimes10k))

# Transform the weather table: remove the first column (date)
f_weather = f_weather[1:]

# Transform the census table: remove all spaces from column names
# (they cause problems in Spark SQL)
col_names = map(lambda s: s.strip().replace(' ', '_').replace('+', '_'),
                f_census.col_names)
# Update column names in the table
# f_weather.names = col_names
f_census.names = col_names
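# For contrast, a hedged sketch of when h2o.import_file would be the right
# call: a path every node can resolve, such as HDFS. The URL is hypothetical.
f_weather_hdfs = h2o.import_file("hdfs://namenode:8020/data/chicagoAllWeather.csv")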
# ## Dataframe Basics

# In[1]:
# Import our SparkSession so we can use it
from pyspark.sql import SparkSession

# Create our SparkSession; this can take a couple of minutes locally
spark = SparkSession.builder.appName("basics").getOrCreate()

# In[ ]:
# Load in data
from pyspark import SparkFiles
url = "https://s3.amazonaws.com/dataviz-curriculum/day_1/food.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("food.csv"), sep=",", header=True)
df.show()

# In[4]:
# Let's show the data
df.show()

# In[5]:
# Print our schema
df.printSchema()

# In[6]:
# Show the columns
df.columns
sc = SparkContext('local', 'testGeoSpark')

# X = sys.argv[1]

# normal
normalFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
normalPath = os.path.join(normalFilePath)
sc.addFile(normalPath)

# attack
attackFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
attackPath = os.path.join(attackFilePath)
sc.addFile(attackPath)

from pyspark import SparkFiles
# SparkFiles.get expects the file name, not the full original path.
normalRdd = sc.textFile(SparkFiles.get(os.path.basename(normalFilePath)))
attackRdd = sc.textFile(SparkFiles.get(os.path.basename(attackFilePath)))

import geoip2.database
geoDBpath = '/home/worker/workspace/geoDB/GeoLite2-City.mmdb'
geoPath = os.path.join(geoDBpath)
sc.addFile(geoPath)
# reader = geoip2.database.Reader(SparkFiles.get('GeoLite2-City.mmdb'))
# reader = geoip2.database.Reader('GeoLite2-City.mmdb')

# def ip2city(ip):
#     try:
#         city = reader.city(ip).city.name
#     except:
#         city = 'not found'
def send2monit(data):
    """
    Helper function which wraps StompAMQ and an incoming dataframe into a
    notification message, then sends it to the AMQ end-point provided by the
    credentials file.
    """
    import os
    import json
    import stomp
    import time
    import uuid
    import logging

    class StompyListener(object):
        """
        Auxiliary listener class to log all possible states of the Stomp
        connection.
        """
        def __init__(self):
            self.logr = logging.getLogger(__name__)

        def on_connecting(self, host_and_port):
            self.logr.info('on_connecting %s', str(host_and_port))

        def on_error(self, headers, message):
            self.logr.info('received an error %s %s', str(headers), str(message))

        def on_message(self, headers, body):
            self.logr.info('on_message %s %s', str(headers), str(body))

        def on_heartbeat(self):
            self.logr.info('on_heartbeat')

        def on_send(self, frame):
            self.logr.info('on_send HEADERS: %s, BODY: %s ...',
                           str(frame.headers), str(frame.body)[:160])

        def on_connected(self, headers, body):
            self.logr.info('on_connected %s %s', str(headers), str(body))

        def on_disconnected(self):
            self.logr.info('on_disconnected')

        def on_heartbeat_timeout(self):
            self.logr.info('on_heartbeat_timeout')

        def on_before_message(self, headers, body):
            self.logr.info('on_before_message %s %s', str(headers), str(body))
            return (headers, body)

    class StompAMQ(object):
        """
        Class to generate and send notifications to a given Stomp broker on a
        given topic.

        :param username: The username to connect to the broker.
        :param password: The password to connect to the broker.
        :param producer: The 'producer' field in the notification header.
        :param topic: The topic to be used on the broker.
        :param host_and_ports: The hosts and ports list of the brokers.
            E.g.: [('agileinf-mb.cern.ch', 61213)]
        """

        # Version number to be added in the header
        _version = '0.1'

        def __init__(self, username, password, producer='CMS_WMCore_StompAMQ',
                     topic='/topic/cms.jobmon.wmagent',
                     host_and_ports=None, verbose=0):
            self._host_and_ports = host_and_ports or [('agileinf-mb.cern.ch', 61213)]
            self._username = username
            self._password = password
            self._producer = producer
            self._topic = topic
            self.verbose = verbose

        def send(self, data):
            """
            Connect to the stomp host and send a single notification (or a
            list of notifications).

            :param data: Either a single notification (as returned by
                `make_notification`) or a list of such.
            :return: a list of successfully sent notification bodies
            """
            conn = stomp.Connection(host_and_ports=self._host_and_ports)
            conn.set_listener('StompyListener', StompyListener())
            try:
                conn.start()
                conn.connect(username=self._username, passcode=self._password, wait=True)
            except stomp.exception.ConnectFailedException as exc:
                print("ERROR: Connection to %s failed %s"
                      % (repr(self._host_and_ports), str(exc)))
                return []

            # If only a single notification, put it in a list
            if isinstance(data, dict) and 'topic' in data:
                data = [data]

            successfully_sent = []
            for notification in data:
                body = self._send_single(conn, notification)
                if body:
                    successfully_sent.append(body)

            if conn.is_connected():
                conn.disconnect()
            print('Sent %d docs to %s' % (len(successfully_sent), repr(self._host_and_ports)))
            return successfully_sent

        def _send_single(self, conn, notification):
            """
            Send a single notification to `conn`.

            :param conn: An already connected stomp.Connection.
            :param notification: A dictionary as returned by `make_notification`.
            :return: The notification body in case of success, or else None.
            """
            try:
                body = notification.pop('body')
                destination = notification.pop('topic')
                conn.send(destination=destination,
                          headers=notification,
                          body=json.dumps(body),
                          ack='auto')
                if self.verbose:
                    print('Notification %s sent' % str(notification))
                return body
            except Exception as exc:
                print('ERROR: Notification %s not sent, error: %s'
                      % (str(notification), str(exc)))
                return None

        def make_notification(self, payload, id_, producer=None):
            """
            Generate a notification with the specified data.

            :param payload: Actual notification data.
            :param id_: Id representing the notification.
            :param producer: The notification producer. Default: StompAMQ._producer.
            :return: the generated notification
            """
            producer = producer or self._producer

            notification = {}
            notification['topic'] = self._topic

            # Add headers
            headers = {
                'type': 'cms_wmagent_info',
                'version': self._version,
                'producer': producer
            }
            notification.update(headers)

            # Add body consisting of the payload and metadata
            body = {
                'payload': payload,
                'metadata': {
                    'timestamp': int(time.time()),
                    'id': id_,
                    'uuid': str(uuid.uuid1()),
                }
            }
            notification['body'] = body
            return notification

    # main function logic
    with open(SparkFiles.get('amq_broker.json')) as istream:
        creds = json.load(istream)
    host, port = creds['host_and_ports'].split(':')
    port = int(port)
    amq = StompAMQ(creds['username'], creds['password'],
                   creds['producer'], creds['topic'], [(host, port)])
    arr = []
    for idx, row in enumerate(data):
        # if not idx:
        #     print("### row", row, type(row))
        doc = json.loads(row)
        hid = doc.get("hash", 1)
        arr.append(amq.make_notification(doc, hid))
    amq.send(arr)
    print("### Sent %s docs to CERN MONIT" % len(arr))
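# A hedged usage sketch for send2monit: ship the credentials file with the
# job, then push each partition of JSON documents. The paths and RDD are
# hypothetical; send2monit already iterates its argument, so it can consume a
# partition iterator directly.
sc.addFile("/path/to/amq_broker.json")
docs = sc.textFile("hdfs:///data/wmarchive/docs.json")
docs.foreachPartition(send2monit)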
def mapper(line, title, secfile, idsec):
    post = mdb.posts
    tokens = word_tokenize(line)
    tagged = pos_tag(tokens)
    entities = chunk.ne_chunk(tagged)
    posting = {"securitynow_id": idsec,
               "episode": secfile[3:6],
               "speaker": title,
               "original": line,
               "tokens": tokens,
               "entities": entities,
               # Classify over the tokens; the original iterated the encoded
               # byte string, which yields single characters.
               "sentiment": classifier.classify(dict([(word, True) for word in tokens]))}
    post_id = post.insert(posting)

sc.addFile("/home/th3m4d0n3/NetBeansProjects/twAppDemo/data_dir/allSentimentData")
with open(SparkFiles.get("allSentimentData")) as f:
    reader = csv.reader(f, delimiter=" ", quotechar='"')
    jobs = bg.BackgroundJobManager()
    map(parseForNltk, reader)

print("chezdata type DATA: {0} COUNT: {1}".format(type(chezdata), len(chezdata)))

map(getHighest, chezdata)

chezdataP = sc.parallelize(chezdata)
lowRatedP = sc.parallelize(lowRated)
highlyRatedP = sc.parallelize(highlyRated)

print("chezdataP type DATA: {0} COUNT: {1}".format(type(chezdataP), chezdataP.count()))
print("lowRatedP type DATA: {0} COUNT: {1}".format(type(lowRatedP), lowRatedP.count()))
# Can I persist a Caffe network object?
import copy

from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles
from pyspark import StorageLevel

conf = SparkConf().setAppName("SparkCaffe Test")
conf.set("spark.executor.memory", "1g")
sc = SparkContext(conf=conf)

sc.addFile("models/solver.prototxt")
sc.addFile("models/train_val.prototxt")

solver = SparkFiles.get("solver.prototxt")
architecture = SparkFiles.get("train_val.prototxt")

def create_net(solver_filename):
    from caffe import SGDSolver
    net = SGDSolver(str(solver_filename)).net
    return net

netRDD = sc.parallelize([solver] * 2, 2).map(create_net)
netRDD.persist(StorageLevel.MEMORY_ONLY)

def extract_unique_val(net):
    return net.params["conv1"][0].data[0, 0, 0, 0]
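# A hedged way to probe the question in the opening comment: evaluate the
# persisted RDD twice and compare. If the Caffe nets survive persistence,
# both passes should return the same weight values without re-running
# create_net.
first = netRDD.map(extract_unique_val).collect()
second = netRDD.map(extract_unique_val).collect()
print("persisted nets stable: %s" % (first == second))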
# -*- coding: utf-8 -*-
import os

from pyspark import SparkContext, SparkFiles

# Apache Spark test job:
# Write a Spark job to count the occurrences of each word in a text file.
# Document that it works with a small example.

CONNECTION_STR = "spark://" + os.environ[
    "SPARK_MASTER_PORT_7077_TCP_ADDR"] + ":" + os.environ[
    "SPARK_MASTER_ENV_SPARK_MASTER_PORT"]

sc = SparkContext(CONNECTION_STR, "test")
sc.addFile(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), "exercise1_data.txt"))

with open(SparkFiles.get("exercise1_data.txt")) as testFile:
    lines = sc.parallelize(testFile.readlines())

words = lines.flatMap(lambda s: s.split())  # Split the lines into words
pairs = words.map(lambda s: (s, 1))  # Create a (word, 1) pair per occurrence
counts = pairs.reduceByKey(lambda a, b: a + b)  # Count the words

# Collect the result and sort it by count, descending
res = sorted(counts.collect(), key=lambda tup: tup[1], reverse=True)
for (word, count) in res:
    print("'{0}' occurs {1} times".format(word, count))  # Print the result
sqlContext = SQLContext(sc)
sqlContext.setConf('spark.sql.shuffle.partitions', '7')

from pyspark.sql import SparkSession
import pyspark.sql as sparksql
from pyspark import SparkFiles

spark = SparkSession.builder.appName('stroke').getOrCreate()

from google.colab import drive
drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/'

# The file lives on the mounted Drive, so read it by path directly;
# SparkFiles.get only resolves names of files shipped with sc.addFile.
train = sqlContext.read.csv('/content/' + root_path + 'train_2v.csv',
                            header=True, inferSchema=True)

# inferSchema asks Spark to scan the data and infer each column's type
# instead of reading everything as strings.
train.groupBy('label').count().show()
train.printSchema()
train.dtypes
train.describe().show()

# Create a DataFrame temporary view for SQL queries
train.createOrReplaceTempView('table')
def fun(iterable):
    # Read the shipped file once, then scale every element by its value.
    with open(SparkFiles.get('num_data')) as f:
        value = int(f.readline())
    return [x * value for x in iterable]
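# A hedged usage sketch for fun: 'num_data' is assumed to be a plain text
# file whose first line is an integer. mapPartitions applies fun once per
# partition, so the file is opened once per partition rather than per element.
sc.addFile("num_data")
scaled = sc.parallelize([1, 2, 3, 4], 2).mapPartitions(fun)
print(scaled.collect())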
distScript = os.getcwd() + "/src/R/finddistance.R"
distScriptName = "finddistance.R"
sc.addFile(distScript)

def hasDistInfo(call):
    """Verify that a call has the fields required to compute the distance"""
    requiredFields = ["mylat", "mylong", "contactlat", "contactlong"]
    return all(map(lambda f: call[f], requiredFields))

def formatCall(call):
    """Format a call so that it can be parsed by our R program"""
    return "{0},{1},{2},{3}".format(
        call["mylat"], call["mylong"], call["contactlat"], call["contactlong"])

pipeInputs = contactsContactList.values().flatMap(
    lambda calls: map(formatCall, filter(hasDistInfo, calls)))
distances = pipeInputs.pipe(SparkFiles.get(distScriptName))
print distances.collect()

# Convert our RDD of strings to numeric data so we can compute stats and
# remove the outliers.
distanceNumerics = distances.map(lambda string: float(string))
stats = distanceNumerics.stats()
stddev = stats.stdev()
mean = stats.mean()
reasonableDistances = distanceNumerics.filter(
    lambda x: math.fabs(x - mean) < 3 * stddev)
print reasonableDistances.collect()
def create_low_test_skycomponents_from_gleam(flux_limit=0.1,
                                             polarisation_frame=PolarisationFrame("stokesI"),
                                             frequency=numpy.array([1e8]),
                                             kind='cubic', phasecentre=None,
                                             radius=1.0, nchan=16) \
        -> List[Skycomponent]:
    """Create sky components from the GLEAM survey

    Stokes I is estimated from a cubic spline fit to the measured fluxes.
    The polarised flux is always zero.

    See http://www.mwatelescope.org/science/gleam-survey
    The catalog is available from Vizier.

    VIII/100 GaLactic and Extragalactic All-sky MWA survey
    (Hurley-Walker+, 2016)

    GaLactic and Extragalactic All-sky Murchison Wide Field Array (GLEAM)
    survey. I: A low-frequency extragalactic catalogue. Hurley-Walker N.,
    et al., Mon. Not. R. Astron. Soc., 464, 1146-1167 (2017), 2017MNRAS.464.1146H

    :rtype: Union[None, List[arl.data.data_models.Skycomponent], List]
    :param flux_limit: Only write components brighter than this (Jy)
    :param polarisation_frame: Polarisation frame (default PolarisationFrame("stokesI"))
    :param frequency: Frequencies at which the flux will be estimated
    :param kind: Kind of interpolation (see scipy.interpolate.interp1d) Default: cubic
    :param phasecentre: Desired phase centre (SkyCoord) default None implies all sources
    :param radius: Radius of sources selected around phasecentre (default 1.0 rad)
    :param nchan: Number of channels; selects which precomputed component file to load
    :return: List of Skycomponents
    """
    # fitsfile = arl_path("data/models/GLEAM_EGC.fits")
    #
    # rad2deg = 180.0 / numpy.pi
    # decmin = phasecentre.dec.to('deg').value - rad2deg * radius / 2.0
    # decmax = phasecentre.dec.to('deg').value + rad2deg * radius / 2.0
    #
    # hdulist = fits.open(fitsfile, lazy_load_hdus=False)
    # recs = hdulist[1].data[0].array
    #
    # # Do the simple forms of filtering in pyfits. Filtering on radius is done below.
    # fluxes = recs['peak_flux_wide']
    #
    # mask = fluxes > flux_limit
    # filtered_recs = recs[mask]
    #
    # decs = filtered_recs['DEJ2000']
    # mask = decs > decmin
    # filtered_recs = filtered_recs[mask]
    #
    # decs = filtered_recs['DEJ2000']
    # mask = decs < decmax
    # filtered_recs = filtered_recs[mask]
    #
    # ras = filtered_recs['RAJ2000']
    # decs = filtered_recs['DEJ2000']
    # names = filtered_recs['Name']
    #
    # if polarisation_frame is None:
    #     polarisation_frame = PolarisationFrame("stokesI")
    #
    # npol = polarisation_frame.npol
    #
    # nchan = len(frequency)
    #
    # # For every source, we read all measured fluxes and interpolate to the
    # # required frequencies
    # gleam_freqs = numpy.array([76, 84, 92, 99, 107, 115, 122, 130, 143, 151, 158,
    #                            166, 174, 181, 189, 197, 204, 212, 220, 227])
    # gleam_flux_freq = numpy.zeros([len(names), len(gleam_freqs)])
    # for i, f in enumerate(gleam_freqs):
    #     gleam_flux_freq[:, i] = filtered_recs['int_flux_%03d' % (f)][:]
    #
    # skycomps = []
    #
    # for isource, name in enumerate(names):
    #     direction = SkyCoord(ra=ras[isource] * u.deg, dec=decs[isource] * u.deg)
    #     if phasecentre is None or direction.separation(phasecentre).to('rad').value < radius:
    #         fint = interpolate.interp1d(gleam_freqs * 1.0e6, gleam_flux_freq[isource, :], kind=kind)
    #         flux = numpy.zeros([nchan, npol])
    #         flux[:, 0] = fint(frequency)
    #         if not numpy.isnan(flux).any():
    #             skycomps.append(Skycomponent(direction=direction, flux=flux, frequency=frequency,
    #                                          name=name, shape='Point',
    #                                          polarisation_frame=polarisation_frame))
    #
    # log.info('create_low_test_skycomponents_from_gleam: %d sources above flux limit %.3f'
    #          % (len(skycomps), flux_limit))
    #
    # hdulist.close()

    # The precomputed component list is shipped with the job and loaded here
    # from a per-nchan pickle.
    with open(SparkFiles.get("sc" + str(nchan)), "rb") as f:
        skycomps = pickle.load(f)

    return skycomps
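# A hedged driver-side companion: the pickles read above (named "sc<nchan>")
# must have been shipped with the job beforehand. The source directory is
# hypothetical.
for n in (8, 16):
    sc.addFile("precomputed/sc%d" % n)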
import os
import sys
import logging
import logging.config

import spa_utils
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark import SparkFiles
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
import pyspark.sql.types as sql_type

app_name = os.path.basename(__file__)
spark = SparkSession.builder.appName(
    app_name).enableHiveSupport().getOrCreate()

# Plain data files are shipped with addFile; addPyFile is reserved for Python
# dependencies such as SPA_baseline_functions.py.
spark.sparkContext.addFile('logging.conf')
spark.sparkContext.addFile('params.yaml')
spark.sparkContext.addPyFile('SPA_baseline_functions.py')

logging.config.fileConfig(SparkFiles.get('logging.conf'))  # load the logging configuration
logger = logging.getLogger(app_name)  # the logger shares the job's name

# Load the job parameters
params = spa_utils.load_params()
logger.info('parameter file loaded')
logger.debug(params)

# Command-line arguments take higher priority
if len(sys.argv) >= 2:
    params['update_origin'] = sys.argv[1]
    params['update_end'] = sys.argv[1]
if len(sys.argv) >= 3:
    params['update_end'] = sys.argv[2]
if len(sys.argv) >= 4:
    params['write_mode'] = sys.argv[3]
from pyspark import SparkContext
from pyspark import SparkFiles

finddistance = "/home/maria_dev/finddistance.R"
finddistancename = "finddistance.R"

sc = SparkContext("local", "SparkFile App")
sc.addFile(finddistance)
print "Absolute Path -> %s" % SparkFiles.get(finddistancename)
def upload_file_job(context):
    from pyspark import SparkFiles
    # upload_file_name is expected to be defined in the enclosing scope and
    # to have been shipped with sc.addFile before this job runs.
    with open(SparkFiles.get(upload_file_name)) as testFile:
        file_val = testFile.readline()
    return file_val
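# A hedged usage sketch for upload_file_job: ship a small text file, then run
# the job across a couple of partitions. The file name is hypothetical, and
# the context argument is unused by the job itself.
upload_file_name = "greeting.txt"
sc.addFile(upload_file_name)
first_lines = sc.parallelize(range(2), 2).map(upload_file_job).collect()
print(first_lines)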
import time
from datetime import timedelta, datetime

import matplotlib.pyplot as plt
import numpy as np
from pandas.plotting import autocorrelation_plot

from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder \
    .master("local[1]") \
    .appName("Vaccinations") \
    .getOrCreate()

# LOADING DATA
url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/us_state_vaccinations.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv("file://" + SparkFiles.get("us_state_vaccinations.csv"),
                    header=True, inferSchema=True) \
    .select("date", "location", "daily_vaccinations")

# ONLY INCLUDE ROWS WHERE date, location, AND daily_vaccinations ARE PRESENT
print("all rows", df.count())
df = df.na.drop()
print("only non null", df.count())

# CONVERT TO DATE TYPE
df = df.select('*', unix_timestamp(df.date.cast('date')).alias('time'))

# GET LIST OF ALL STATES
states = df.rdd.map(lambda x: x.location).distinct().collect()
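# A hedged sketch of the per-state regression this setup appears to lead to:
# fit daily_vaccinations against the unix time for a single state. Column and
# variable names follow the snippet; the assembler and its output column name
# are our additions.
from pyspark.ml.feature import VectorAssembler

one_state = df.filter(df.location == states[0])
assembler = VectorAssembler(inputCols=["time"], outputCol="features")
fitted = LinearRegression(featuresCol="features", labelCol="daily_vaccinations") \
    .fit(assembler.transform(one_state))
print(fitted.coefficients, fitted.intercept)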
# get(filename)
# Returns the path of a file added through SparkContext.addFile().
#
# getRootDirectory()
# Returns the path to the root directory that contains files added through
# SparkContext.addFile().

from pyspark import SparkContext
from pyspark import SparkFiles

file = "/Users/zhangyong/pyspark_learning/README.md"
filename = "README.md"

sc = SparkContext("local", "SparkFile App")
sc.addFile(file)
print "Absolute Path -> %s" % SparkFiles.get(filename)
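# A small companion line for getRootDirectory(): every file shipped with
# addFile lands under this directory, so the path printed above should start
# with the path printed here.
print "Root Directory -> %s" % SparkFiles.getRootDirectory()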
# Import libraries
import pyspark
from pyspark.sql import SQLContext
from pyspark import SparkFiles

# Set up the Spark context and SQL context
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)

# Ship the data file with the job, then read it into a DataFrame.
# SparkFiles.get resolves names of files added via sc.addFile; passing it a
# raw local path that was never added would not work on a cluster.
sc.addFile("D:/Masters/KDM/data.csv")
df = sqlContext.read.csv(SparkFiles.get("data.csv"), header=True, inferSchema=True)

# RDD Actions
# Collect all rows of the data set
action_1 = df.collect()
print("Action 1: Collecting all the information present within the data set")
print(action_1)

# Count the number of rows in the data set
action_2 = df.count()
print("Action 2: Count the number of data points present within the data set")
print(action_2)

# Return the first 'n' rows of the data set
action_3 = df.take(2)
print("Action 3: Taking out 'n' number of data points from the entire data set")
print(action_3)

# RDD Transformations with Actions
# Group data and count totals based on the 'Contract' feature
def _getCountryByIP(ip):
    # Open the GeoLite2 database shipped with the job and look up the country.
    citydb = geoIP.Reader(SparkFiles.get('GeoLite2-City.mmdb'))
    return (citydb.city(ip).country.name or u'Unknown').encode()
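# A hedged usage sketch: ship the database once, then map the lookup over an
# RDD of address strings. The alias geoIP is assumed to come from
# `import geoip2.database as geoIP`; the database path is hypothetical and
# the sample address is a documentation IP.
import geoip2.database as geoIP

sc.addFile("/data/GeoLite2-City.mmdb")
countries = sc.parallelize(["198.51.100.7"]).map(_getCountryByIP).collect()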
def _resolvepath(self, p):
    if self._conf['use_sparkfiles']:
        from pyspark import SparkFiles
        return SparkFiles.get(p)
    else:
        return p
# X = sys.argv[1]

# normal
normalFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
normalPath = os.path.join(normalFilePath)
sc.addFile(normalPath)

# attack
attackFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
attackPath = os.path.join(attackFilePath)
sc.addFile(attackPath)

from pyspark import SparkFiles
# SparkFiles.get expects the file name, not the full original path.
normalRdd = sc.textFile(SparkFiles.get(os.path.basename(normalFilePath))).cache()
attackRdd = sc.textFile(SparkFiles.get(os.path.basename(attackFilePath))).cache()

# src, dst, data_length, protocol_name, protocol_number, arrival_time (len = 6)
normalRaw = normalRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache()
attackRaw = attackRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache()

# (ip, count)
normalTopXSrcIP = normalRaw.map(lambda x: (x[0], 1)).groupByKey() \
    .map(lambda (k, v): (k, sum(v))).takeOrdered(10, key=lambda (k, v): -v)
attackTopXSrcIP = attackRaw.map(lambda x: (x[0], 1)).groupByKey() \
    .map(lambda (k, v): (k, sum(v))).takeOrdered(10, key=lambda (k, v): -v)

# (ip, count)
normalTopXDstIP = normalRaw.map(lambda x: (x[1], 1)).groupByKey() \
    .map(lambda (k, v): (k, sum(v))).takeOrdered(10, key=lambda (k, v): -v)
attackTopXDstIP = attackRaw.map(lambda x: (x[1], 1)).groupByKey() \
    .map(lambda (k, v): (k, sum(v))).takeOrdered(10, key=lambda (k, v): -v)

# (ip, data_length)
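# A hedged aside on the aggregation above: groupByKey materializes every
# occurrence of a key before summing, while reduceByKey combines counts
# map-side first and is usually the better choice for this shape of job.
normalTopXSrcIP = normalRaw.map(lambda x: (x[0], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .takeOrdered(10, key=lambda (k, v): -v)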
def find_neighbors(iter):
    # Load the Annoy index shipped with the job; `rank` and `number_recs`
    # come from the enclosing scope.
    t = AnnoyIndex(rank)
    t.load(SparkFiles.get("index.ann"))
    # search_k
    return ((x[0] - 1, t.get_nns_by_item(x[0] - 1, int(number_recs)))
            for x in iter)
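# A hedged usage sketch for find_neighbors: build and ship the index on the
# driver, then query it per partition. The vector size, toy data, and tree
# count are hypothetical; items in the snippet above are 1-indexed, hence the
# (i + 1,) tuples.
from annoy import AnnoyIndex

rank, number_recs = 8, 5
vectors = [[float(i + j) for j in range(rank)] for i in range(100)]  # toy data
index = AnnoyIndex(rank)
for i, vec in enumerate(vectors):
    index.add_item(i, vec)
index.build(10)
index.save("index.ann")
sc.addFile("index.ann")

neighbors = sc.parallelize([(i + 1,) for i in range(len(vectors))], 4) \
    .mapPartitions(find_neighbors).collect()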
def driver(sc, inputFilename, outputDirectory, crfExecutable, crfScript,
           featureListFilename, crfModelFilename,
           eyeColorRef, eyeColorConfig, hairRef, hairConfig,
           limit=limit, location='hdfs', outputFormat="text", partitions=None):
    dump = False
    partitions = 8

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except OSError:
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    origSize = rdd_sequence_file_input.count()
    # if limit:
    #     rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (
        inputFilename, rdd_sequence_file_input.count(), origSize, limit,
        rdd_sequence_file_input.getNumPartitions())

    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)),
                         SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    # ### WE NO LONGER HAVE TO GROUPBY
    # ### BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW
    # rdd_withuri = sc.parallelize(rdd_withuri.take(10))

    rdd_final = (rdd_sequence_file_input
                 .mapValues(lambda x: json.loads(x))
                 .mapValues(lambda x: extract_body(x))
                 .mapValues(lambda x: textTokens(x))
                 .map(lambda x: (x[0], c.computeFeatMatrix(x[1], False,
                                                           addLabels=[x[0]], addIndex=True)))
                 .mapValues(lambda x: base64.b64encode(vectorToString(x)))
                 .values()
                 .pipe(cmd)
                 .map(lambda x: restore(x))
                 .mapValues(lambda x: computeSpans(x, indexed=True))
                 .filter(lambda p: p[1])
                 .flatMapValues(lambda x: list(x))
                 .mapValues(lambda x: alignToControlledVocab(
                     x, {"eyeColor": smEyeColor, "hairType": smHairColor}))
                 .mapValues(lambda x: json.dumps(x)))

    empty = rdd_final.isEmpty()
    if not empty:
        l = "unknown>1"
        print "### writing %s output (%s records) to %s" % (outputFormat, l, outputDirectory)
        print len(rdd_final.collect())
        # if outputFormat == "sequence":
        #     rdd_final.saveAsSequenceFile(outputDirectory)
        # elif outputFormat == "text":
        #     rdd_final.saveAsTextFile(outputDirectory)
        # else:
        #     raise RuntimeError("Unrecognized output format: %s" % outputFormat)
    else:
        print "### No records: no output into %s" % (outputDirectory)
def create_named_configuration(name: str = 'LOWBD2', **kwargs) -> Configuration:
    """ Standard configurations e.g. LOWBD2, MIDBD2

    :param name: name of Configuration LOWBD2, LOWBD1, LOFAR, VLAA
    :param rmax: Maximum distance of station from the average (m)
    :return: Configuration

    For LOWBD2, setting rmax gives the following number of stations:

        rmax (m)   stations
        100.0      13
        300.0      94
        1000.0     251
        3000.0     314
        10000.0    398
        30000.0    476
        100000.0   512
    """
    if name == 'LOWBD2':
        location = EarthLocation(lon="116.4999", lat="-26.7000", height=300.0)
        fc = create_configuration_from_file(antfile=arl_path("LOWBD2.csv"),
                                            location=location, mount='xy',
                                            names='LOWBD2_%d', diameter=35.0, **kwargs)
    elif name == 'LOWBD1':
        location = EarthLocation(lon="116.4999", lat="-26.7000", height=300.0)
        fc = create_configuration_from_file(antfile=SparkFiles.get("LOWBD1.csv"),
                                            location=location, mount='xy',
                                            names='LOWBD1_%d', diameter=35.0, **kwargs)
    elif name == 'LOWBD2-CORE':
        location = EarthLocation(lon="116.4999", lat="-26.7000", height=300.0)
        fc = create_configuration_from_file(antfile=arl_path("LOWBD2-CORE.csv"),
                                            location=location, mount='xy',
                                            names='LOWBD2_%d', diameter=35.0, **kwargs)
    elif name == 'LOFAR':
        assert get_parameter(kwargs, "meta", False) is False
        fc = create_LOFAR_configuration(antfile=arl_path("data/configurations/LOFAR.csv"))
    elif name == 'VLAA':
        location = EarthLocation(lon="-107.6184", lat="34.0784", height=2124.0)
        fc = create_configuration_from_file(
            antfile=arl_path("data/configurations/VLA_A_hor_xyz.csv"),
            location=location, mount='altaz', names='VLA_%d', diameter=25.0, **kwargs)
    elif name == 'VLAA_north':
        location = EarthLocation(lon="-107.6184", lat="90.000", height=2124.0)
        fc = create_configuration_from_file(
            antfile=arl_path("data/configurations/VLA_A_hor_xyz.csv"),
            location=location, mount='altaz', names='VLA_%d', diameter=25.0, **kwargs)
    else:
        raise ValueError("No such Configuration %s" % name)
    return fc
import pandas as pd
from pyspark import SparkFiles

spark.sparkContext.addFile(dataPath + 'data-payment_lookup-csv.csv')
spark.sparkContext.addFile(dataPath + 'data-vendor_lookup-csv.csv')
spark.sparkContext.addFile(dataPath + 'data-sample_data-nyctaxi-trips-2009-json_corrigido.json')
spark.sparkContext.addFile(dataPath + 'data-sample_data-nyctaxi-trips-2010-json_corrigido.json')
spark.sparkContext.addFile(dataPath + 'data-sample_data-nyctaxi-trips-2011-json_corrigido.json')
spark.sparkContext.addFile(dataPath + 'data-sample_data-nyctaxi-trips-2012-json_corrigido.json')

# ### Reading and fixing the Payment source

# In[7]:
df_payment = spark.read.csv(SparkFiles.get("data-payment_lookup-csv.csv"),
                            header=True, sep=",")
df_payment.show(3)

# The first line needs to be skipped, so an index is added to help with the fix.
# Pandas is used to read the CSV while ignoring the line at index 0.

# In[8]:
temp = pd.read_csv(SparkFiles.get("data-payment_lookup-csv.csv"),
                   skiprows=[0], sep=',', header=None)
temp.head()
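# A hedged follow-up sketch: once the stray first line is dropped, the pandas
# frame can be promoted back to a Spark DataFrame for joins with the trip
# data. The column names are hypothetical, and temp.iloc[1:] assumes the row
# pandas read at position 0 is the real header.
df_payment_fixed = spark.createDataFrame(temp.iloc[1:],
                                         schema=["payment_type", "payment_lookup"])
df_payment_fixed.show(3)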