def subQuery(ct):
    serialized = example.Serialized()

    # Rebuild the crypto context from the file shipped to the executors via addFile().
    example.ReadSerializationFromFile(SparkFiles.get('cryptocontext.cc'), serialized)
    cryptoContext = example.CryptoContextFactory.DeserializeAndCreateContext(serialized, False)

    # Deserialize the encrypted query and the database ciphertext passed in as a string.
    example.ReadSerializationFromFile(SparkFiles.get('query.enc'), serialized)
    queryCT = cryptoContext.deserializeCiphertext(serialized)

    example.StringToSerialization(ct, serialized)
    dfCT = cryptoContext.deserializeCiphertext(serialized)

    # Homomorphic subtraction: the result decrypts to all zeros when the entries match.
    result = cryptoContext.EvalSub(dfCT, queryCT)

    """
    # Verify that one of them subs to 0
    example.ReadSerializationFromFile(SparkFiles.get('key.sec'), serialized)
    secKey = cryptoContext.deserializeSecretKey(serialized)

    plaintextDec = example.IntPlaintextEncoding([])
    decryptResult = cryptoContext.Decrypt(secKey, [result], plaintextDec, True)
    plaintextDec = example.decodeInts(plaintextDec)
    plaintextDec = plaintextDec[:decryptResult.messageLength]
    print(all(v == 0 for v in plaintextDec))
    """

    result.Serialize(serialized)
    return example.SerializationToString(serialized, '')[1]
def test_add_file_locally(self):
    path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
    self.sc.addFile(path)
    download_path = SparkFiles.get("hello.txt")
    self.assertNotEqual(path, download_path)
    with open(download_path) as test_file:
        self.assertEqual("Hello World!\n", test_file.readline())
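The test above exercises the basic addFile()/SparkFiles.get() round trip on the driver. A minimal end-to-end sketch of the same pattern on executors, assuming a local file /tmp/lookup.txt exists (the path, app name, and helper are illustrative, not taken from these examples):

from pyspark import SparkContext, SparkFiles

sc = SparkContext(appName="sparkfiles-sketch")
sc.addFile("/tmp/lookup.txt")  # ships the file to every executor

def uses_lookup(x):
    # Inside executor code, pass only the basename to SparkFiles.get().
    with open(SparkFiles.get("lookup.txt")) as f:
        allowed = set(line.strip() for line in f)
    return x in allowed

print(sc.parallelize(["a", "b"]).map(uses_lookup).collect())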
def get_or_extract(archive_path):
    """Given a path returned by add_local_model(), this method will return the local
    directory into which the archive has been extracted.
    If this Python process ever extracted the archive before, we will reuse that copy.
    """
    from pyspark.files import SparkFiles
    import zipfile

    if archive_path in _SparkDirectoryDistributor._extracted_dir_paths:
        return _SparkDirectoryDistributor._extracted_dir_paths[archive_path]

    # BUG: Despite the documentation of SparkContext.addFile() and SparkFiles.get() in Scala
    # and Python, it turns out that we actually need to use the basename as the input to
    # SparkFiles.get(), as opposed to the (absolute) path.
    archive_path_basename = os.path.basename(archive_path)
    local_path = SparkFiles.get(archive_path_basename)

    temp_dir = tempfile.mkdtemp()
    zip_ref = zipfile.ZipFile(local_path, "r")
    zip_ref.extractall(temp_dir)
    zip_ref.close()

    _SparkDirectoryDistributor._extracted_dir_paths[archive_path] = temp_dir
    return _SparkDirectoryDistributor._extracted_dir_paths[archive_path]
def mult(ct1, ct2):
    serialized = example.Serialized()

    # Rebuild the crypto context from the file shipped to the executors via addFile().
    example.ReadSerializationFromFile(SparkFiles.get('cryptocontext.cc'), serialized)
    cryptoContext = example.CryptoContextFactory.DeserializeAndCreateContext(serialized, False)

    ct1 = ct1.value
    example.StringToSerialization(ct1, serialized)
    ct1 = cryptoContext.deserializeCiphertext(serialized)

    ct2 = ct2.value
    example.StringToSerialization(ct2, serialized)
    ct2 = cryptoContext.deserializeCiphertext(serialized)

    # Homomorphic multiplication of the two ciphertexts.
    result = cryptoContext.EvalMult(ct1, ct2)

    """
    # Verify that one of them mults to 0
    example.ReadSerializationFromFile(SparkFiles.get('key.sec'), serialized)
    secKey = cryptoContext.deserializeSecretKey(serialized)

    plaintextDec = example.IntPlaintextEncoding([])
    decryptResult = cryptoContext.Decrypt(secKey, [result], plaintextDec, True)
    plaintextDec = example.decodeInts(plaintextDec)
    plaintextDec = plaintextDec[:decryptResult.messageLength]
    print(all(v == 0 for v in plaintextDec))
    """

    result.Serialize(serialized)
    return Row(value=example.SerializationToString(serialized, '')[1])
def predict_map(rdd):
    from pyspark.files import SparkFiles

    config = bc_config.value
    fmap = bc_fmap.value

    static_dir = SparkFiles.get(bc_static_model_dir.value)
    ckpt_dir = SparkFiles.get("ckpt")

    from dlflow.mgr import Collector, model

    collect = Collector()
    collect(static_dir, "Static models")

    input_cls = model[config.MODEL.input_name]
    dataset = input_cls(fmap).rdd_inputs(rdd, config.MODEL.batch_size)

    model_cls = model[config.MODEL.model_name]
    model_ins = model_cls(fmap)
    model_ins.load_model(ckpt_dir)

    return model_ins.predict_act(dataset)
def execute(self):
    """
    Overrides the execute method of the Reader class to read a file from an
    HTTP/HTTPS location.

    :return: pyspark.sql.DataFrame
        Returns a Spark DataFrame with the data from the specified file.
    """
    try:
        spark_context = self.spark.sparkContext
        # addFile() downloads the remote file to every node; SparkFiles.get()
        # then resolves the local copy by its basename.
        spark_context.addFile(self.location)
        return self.spark.read.format(self.file_format) \
            .load(SparkFiles.get(self.location.split('/')[-1]))
    except AnalysisException:
        raise
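The same addFile()-then-load pattern works outside the Reader class. A minimal sketch, assuming a reachable CSV URL (the URL, app name, and options are illustrative):

from pyspark.sql import SparkSession
from pyspark import SparkFiles

spark = SparkSession.builder.appName("http-read-sketch").getOrCreate()

url = "https://example.com/data/people.csv"  # hypothetical URL
spark.sparkContext.addFile(url)              # downloads the file to every node

# SparkFiles.get() takes the basename of the downloaded file.
df = spark.read.format("csv") \
    .option("header", "true") \
    .load(SparkFiles.get(url.split("/")[-1]))
df.show()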
def get_or_load(archive_path):
    """Given a path returned by add_local_model(), this method will return the loaded model.
    If this Python process ever loaded the model before, we will reuse that copy.
    """
    if archive_path in SparkModelCache._models:
        SparkModelCache._cache_hits += 1
        return SparkModelCache._models[archive_path]

    local_path = SparkFiles.get(archive_path)
    temp_dir = tempfile.mkdtemp()
    zip_ref = zipfile.ZipFile(local_path, 'r')
    zip_ref.extractall(temp_dir)
    zip_ref.close()

    # We must rely on a supposed cyclic import here because we want this behavior
    # on the Spark Executors (i.e., don't try to pickle the load_model function).
    from mlflow.pyfunc import load_pyfunc  # pylint: disable=cyclic-import

    SparkModelCache._models[archive_path] = load_pyfunc(temp_dir)
    return SparkModelCache._models[archive_path]
def get_or_load(archive_path):
    """Given a path returned by add_local_model(), this method will return the loaded model.
    If this Python process ever loaded the model before, we will reuse that copy.
    """
    if archive_path in SparkModelCache._models:
        SparkModelCache._cache_hits += 1
        return SparkModelCache._models[archive_path]

    # BUG: Despite the documentation of SparkContext.addFile() and SparkFiles.get() in Scala
    # and Python, it turns out that we actually need to use the basename as the input to
    # SparkFiles.get(), as opposed to the (absolute) path.
    archive_path_basename = os.path.basename(archive_path)
    local_path = SparkFiles.get(archive_path_basename)

    temp_dir = tempfile.mkdtemp()
    zip_ref = zipfile.ZipFile(local_path, 'r')
    zip_ref.extractall(temp_dir)
    zip_ref.close()

    # We must rely on a supposed cyclic import here because we want this behavior
    # on the Spark Executors (i.e., don't try to pickle the load_model function).
    from mlflow.pyfunc import load_pyfunc  # pylint: disable=cyclic-import

    SparkModelCache._models[archive_path] = load_pyfunc(temp_dir)
    return SparkModelCache._models[archive_path]
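Both mlflow snippets above work around the same quirk: a file added under an absolute path is retrieved by its basename. A minimal sketch of that workaround in isolation (the archive path and app name are illustrative):

import os
from pyspark import SparkContext, SparkFiles

sc = SparkContext(appName="basename-sketch")

archive_path = "/tmp/models/model.zip"  # hypothetical local archive
sc.addFile(archive_path)

# Passing the absolute path to SparkFiles.get() would point at a non-existent
# location under the Spark files root; the basename resolves the local copy.
local_path = SparkFiles.get(os.path.basename(archive_path))
print(local_path)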
def get_file_path(name):
    return Path(SparkFiles.get(name))
from pyspark.sql import SparkSession
from pyspark.files import SparkFiles
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from pyspark.sql.types import StringType, StructType, ArrayType, StructField, Row
from collections import OrderedDict
import nltk

nltk.download()

spark = SparkSession \
    .builder \
    .appName("Python NLTK example") \
    .getOrCreate()

classifier_path = SparkFiles.get("english.all.3class.distsim.crf.ser.gz")
ner_jar = SparkFiles.get("stanford-ner.jar")
st = StanfordNERTagger(classifier_path, ner_jar, encoding='utf-8')


def classify_text(row):
    text = row["body_t"]
    # Collapse array to single value field
    all_text = str.join(' ', text)
    tokenized_text = word_tokenize(all_text)
    classified_text = st.tag(tokenized_text)

    # Group the tagged words by entity class.
    classifiers_dict = dict()
    for word, clz in classified_text:
        if clz in classifiers_dict:
            classifiers_dict[clz].add(word)
        else:
            classifiers_dict[clz] = {word}
from pyspark.files import SparkFiles

if __name__ == '__main__':
    '''
    Usage: xnor_spark
    '''
    spark = SparkSession \
        .builder \
        .appName('xnor_spark') \
        .getOrCreate()
    spark.sparkContext.setLogLevel('ERROR')

    serialized = example.Serialized()
    print(SparkFiles.get('key.pub'))
    example.ReadSerializationFromFile(SparkFiles.get('key.pub'), serialized)

    m = hashlib.sha1()
    pubKey = example.SerializationToString(serialized, '')[1]
    m.update(pubKey.encode('utf-8'))
    dirName = m.hexdigest()[:3]
    print(dirName)

    # The "database"
    start = time.time()
    print('Reading files...')
    df = spark.read.text(glob.glob('./' + dirName + '/server/*.enc'))
    end = time.time()
    print('...took', end - start)

    df.show()
    print(df.rdd.getNumPartitions(), "partitions")