Example #1
    def subQuery(ct):
        serialized = example.Serialized()
        example.ReadSerializationFromFile(SparkFiles.get('cryptocontext.cc'), serialized)
        cryptoContext = example.CryptoContextFactory.DeserializeAndCreateContext(serialized, False)

        example.ReadSerializationFromFile(SparkFiles.get('query.enc'), serialized)
        queryCT = cryptoContext.deserializeCiphertext(serialized)

        example.StringToSerialization(ct, serialized)
        dfCT = cryptoContext.deserializeCiphertext(serialized)
        
        result = cryptoContext.EvalSub(dfCT, queryCT)

        """
        # Verify that one of them subs to 0
        example.ReadSerializationFromFile(SparkFiles.get('key.sec'), serialized)
        secKey = cryptoContext.deserializeSecretKey(serialized)
        plaintextDec = example.IntPlaintextEncoding([])
        decryptResult = cryptoContext.Decrypt(secKey, [result], plaintextDec, True)
        plaintextDec = example.decodeInts(plaintextDec)
        plaintextDec = plaintextDec[:decryptResult.messageLength]
        print(all(v == 0 for v in plaintextDec))
        """

        result.Serialize(serialized)
        return example.SerializationToString(serialized, '')[1]
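A hedged sketch of how subQuery might be invoked from the driver, assuming a DataFrame df whose column value holds one serialized ciphertext per row (both names are assumptions not shown in the example):

# Hypothetical usage: subtract the broadcast query ciphertext from every row's ciphertext.
diff_rdd = df.rdd.map(lambda row: subQuery(row.value))
print(diff_rdd.take(1))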
Example #2
 def test_add_file_locally(self):
     path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
     self.sc.addFile(path)
     download_path = SparkFiles.get("hello.txt")
     self.assertNotEqual(path, download_path)
     with open(download_path) as test_file:
         self.assertEquals("Hello World!\n", test_file.readline())
Example #4
    def get_or_extract(archive_path):
        """Given a path returned by add_local_model(), this method will return a tuple of
        (loaded_model, local_model_path).
        If this Python process ever loaded the model before, we will reuse that copy.
        """
        from pyspark.files import SparkFiles
        import zipfile

        if archive_path in _SparkDirectoryDistributor._extracted_dir_paths:
            return _SparkDirectoryDistributor._extracted_dir_paths[archive_path]

        # BUG: Despite the documentation of SparkContext.addFile() and SparkFiles.get() in Scala
        # and Python, it turns out that we actually need to use the basename as the input to
        # SparkFiles.get(), as opposed to the (absolute) path.
        archive_path_basename = os.path.basename(archive_path)
        local_path = SparkFiles.get(archive_path_basename)
        temp_dir = tempfile.mkdtemp()
        zip_ref = zipfile.ZipFile(local_path, "r")
        zip_ref.extractall(temp_dir)
        zip_ref.close()

        _SparkDirectoryDistributor._extracted_dir_paths[archive_path] = temp_dir
        return _SparkDirectoryDistributor._extracted_dir_paths[archive_path]
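A hedged sketch of the caveat described in the BUG comment above: the driver registers an absolute path, but SparkFiles.get() must be called with the basename (the path and the SparkContext variable sc are assumptions):

import os
from pyspark import SparkFiles

archive = "/tmp/models/my_model.zip"   # hypothetical absolute path on the driver
sc.addFile(archive)

# Passing the absolute path to SparkFiles.get() does not resolve the distributed copy;
# pass only the basename instead.
local_copy = SparkFiles.get(os.path.basename(archive))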
Example #5
    def mult(ct1, ct2):
        serialized = example.Serialized()
        example.ReadSerializationFromFile(SparkFiles.get('cryptocontext.cc'), serialized)
        cryptoContext = example.CryptoContextFactory.DeserializeAndCreateContext(serialized, False)

        ct1 = ct1.value
        example.StringToSerialization(ct1, serialized)
        ct1 = cryptoContext.deserializeCiphertext(serialized)

        ct2 = ct2.value
        example.StringToSerialization(ct2, serialized)
        ct2 = cryptoContext.deserializeCiphertext(serialized)

        result = cryptoContext.EvalMult(ct1, ct2)

        """
        # Verify that one of them mults to 0
        example.ReadSerializationFromFile(SparkFiles.get('key.sec'), serialized)
        secKey = cryptoContext.deserializeSecretKey(serialized)
        plaintextDec = example.IntPlaintextEncoding([])
        decryptResult = cryptoContext.Decrypt(secKey, [result], plaintextDec, True)
        plaintextDec = example.decodeInts(plaintextDec)
        plaintextDec = plaintextDec[:decryptResult.messageLength]
        print(all(v == 0 for v in plaintextDec))
        """

        result.Serialize(serialized)
        return Row(value=example.SerializationToString(serialized, '')[1])
Example #6
File: predict.py  Project: zhenglm/dlflow
        def predict_map(rdd):
            from pyspark.files import SparkFiles

            config = bc_config.value
            fmap = bc_fmap.value
            static_dir = SparkFiles.get(bc_static_model_dir.value)
            ckpt_dir = SparkFiles.get("ckpt")

            from dlflow.mgr import Collector, model
            collect = Collector()
            collect(static_dir, "Static models")

            input_cls = model[config.MODEL.input_name]
            dataset = input_cls(fmap).rdd_inputs(rdd, config.MODEL.batch_size)

            model_cls = model[config.MODEL.model_name]
            model_ins = model_cls(fmap)
            model_ins.load_model(ckpt_dir)

            return model_ins.predict_act(dataset)
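Example #6 resolves whole directories ("ckpt" and a static-model directory) through SparkFiles.get(). A hedged sketch of how such directories could be distributed beforehand with addFile(..., recursive=True); the paths, the spark variable, and the broadcast name are assumptions:

# Hypothetical driver-side setup for predict_map above.
sc = spark.sparkContext
sc.addFile("/tmp/static_models", recursive=True)      # directory with model definitions
sc.addFile("/tmp/ckpt", recursive=True)               # checkpoint directory read by load_model()
bc_static_model_dir = sc.broadcast("static_models")   # executors resolve it by basename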
Example #7
 def execute(self):
     """
     Overrides the execute method of the Reader class to read the file from an HTTP/HTTPS location.
     :return: pyspark.sql.DataFrame
         Returns a Spark DataFrame with the data from the specified file.
     """
     try:
         spark_context = self.spark.sparkContext
         spark_context.addFile(self.location)
         return self.spark.read.format(self.file_format) \
             .load(SparkFiles.get(self.location.split('/')[-1]))
     except AnalysisException:
         raise
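A self-contained sketch of the same HTTP/HTTPS read pattern outside the Reader class (the URL and the file format are assumptions):

from pyspark.sql import SparkSession
from pyspark import SparkFiles

spark = SparkSession.builder.appName("http_read_sketch").getOrCreate()
url = "https://example.com/data/sample.csv"   # hypothetical remote file
spark.sparkContext.addFile(url)               # Spark downloads the file to every node
df = spark.read.format("csv").load(SparkFiles.get(url.split("/")[-1]))
df.show()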
Example #8
    def get_or_load(archive_path):
        """Given a path returned by add_local_model(), this method will return the loaded model.
        If this Python process ever loaded the model before, we will reuse that copy.
        """
        if archive_path in SparkModelCache._models:
            SparkModelCache._cache_hits += 1
            return SparkModelCache._models[archive_path]

        local_path = SparkFiles.get(archive_path)
        temp_dir = tempfile.mkdtemp()
        zip_ref = zipfile.ZipFile(local_path, 'r')
        zip_ref.extractall(temp_dir)
        zip_ref.close()

        # We must rely on a supposed cyclic import here because we want this behavior
        # on the Spark Executors (i.e., don't try to pickle the load_model function).
        from mlflow.pyfunc import load_pyfunc  # pylint: disable=cyclic-import
        SparkModelCache._models[archive_path] = load_pyfunc(temp_dir)
        return SparkModelCache._models[archive_path]
Example #9
    def get_or_load(archive_path):
        """Given a path returned by add_local_model(), this method will return the loaded model.
        If this Python process ever loaded the model before, we will reuse that copy.
        """
        if archive_path in SparkModelCache._models:
            SparkModelCache._cache_hits += 1
            return SparkModelCache._models[archive_path]

        # BUG: Despite the documentation of SparkContext.addFile() and SparkFiles.get() in Scala
        # and Python, it turns out that we actually need to use the basename as the input to
        # SparkFiles.get(), as opposed to the (absolute) path.
        archive_path_basename = os.path.basename(archive_path)
        local_path = SparkFiles.get(archive_path_basename)
        temp_dir = tempfile.mkdtemp()
        zip_ref = zipfile.ZipFile(local_path, 'r')
        zip_ref.extractall(temp_dir)
        zip_ref.close()

        # We must rely on a supposed cyclic import here because we want this behavior
        # on the Spark Executors (i.e., don't try to pickle the load_model function).
        from mlflow.pyfunc import load_pyfunc  # pylint: disable=cyclic-import
        SparkModelCache._models[archive_path] = load_pyfunc(temp_dir)
        return SparkModelCache._models[archive_path]
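A hedged sketch of how SparkModelCache.get_or_load() might be used on executors so the archive is extracted and loaded only once per Python worker (the archive name, the DataFrame df, and the predict() call are assumptions):

import pandas as pd

def score_partition(rows):
    rows = list(rows)
    if not rows:
        return iter([])
    # Loaded on the first call in this worker; served from the cache afterwards.
    model = SparkModelCache.get_or_load("model.zip")
    pdf = pd.DataFrame([r.asDict() for r in rows])
    return iter(model.predict(pdf))

predictions = df.rdd.mapPartitions(score_partition)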
Example #10
from pathlib import Path

def get_file_path(name):
    return Path(SparkFiles.get(name))
Example #11
from pyspark.sql import SparkSession
from pyspark.files import SparkFiles
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from pyspark.sql.types import StringType, StructType, ArrayType, StructField, Row
from collections import OrderedDict
import nltk

nltk.download('punkt')  # word_tokenize below needs the punkt tokenizer models

spark = SparkSession \
    .builder \
    .appName("Python NLTK example") \
    .getOrCreate()

classifier_path = SparkFiles.get("english.all.3class.distsim.crf.ser.gz")
ner_jar = SparkFiles.get("stanford-ner.jar")
st = StanfordNERTagger(classifier_path, ner_jar, encoding='utf-8')


def classify_text(row):
    text = row["body_t"]
    # Collapse array to single value field
    all_text = str.join(' ', text)
    tokenized_text = word_tokenize(all_text)
    classified_text = st.tag(tokenized_text)
    classifiers_dict = dict()
    for word, clz in classified_text:
        if clz in classifiers_dict:
            classifiers_dict[clz].add(word)
        else:
            classifiers_dict[clz] = {word}
Example #12
import glob
import hashlib
import time

import example  # project-specific homomorphic-encryption bindings (not part of PySpark)
from pyspark.sql import SparkSession
from pyspark.files import SparkFiles

if __name__ == '__main__':
    '''
        Usage: xnor_spark
    '''

    spark = SparkSession\
            .builder\
            .appName('xnor_spark')\
            .getOrCreate()

    spark.sparkContext.setLogLevel('ERROR')

    serialized = example.Serialized()
    print(SparkFiles.get('key.pub'))
    example.ReadSerializationFromFile(SparkFiles.get('key.pub'), serialized)
    m = hashlib.sha1()
    pubKey = example.SerializationToString(serialized, '')[1]
    m.update(pubKey.encode('utf-8'))
    dirName = m.hexdigest()[:3]
    print(dirName)

    # The "database"
    start = time.time()
    print('Reading files...')
    df = spark.read.text(glob.glob('./'+dirName+'/server/*.enc'))
    end = time.time()
    print('...took', end - start)
    df.show()
    print(df.rdd.getNumPartitions(), "partitions")
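The script above (and the subQuery/mult functions in the earlier examples) assumes that key.pub, cryptocontext.cc, query.enc and key.sec were already distributed to the executors; a hedged sketch of that missing step (the local paths are assumptions):

# Hypothetical setup so that SparkFiles.get('key.pub') etc. can resolve the files,
# e.g. right after creating the SparkSession. Alternatively pass them with
# spark-submit --files key.pub,cryptocontext.cc,query.enc,key.sec
for name in ['key.pub', 'cryptocontext.cc', 'query.enc', 'key.sec']:
    spark.sparkContext.addFile('./demoData/' + name)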