Example #1
File: crfexec.py  Project: cjsanjay/dig-crf
def crfexec(sc, inputFilename, outputDirectory, 
            limit=LIMIT, location='hdfs', outputFormat="text", partitions=None):
    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    crfExecutable = "/usr/local/bin/crf_test"
    crfModelFilename = os.path.join(crfConfigDir, "dig-hair-eye-train.model")

    rdd_pipeinput = sc.textFile(inputFilename)
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()

    # DON'T USE SparkFiles.get to fetch the crf_test or model
    # This only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s -m %s" % (os.path.basename(crfExecutable), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s -m %s" % (SparkFiles.get(os.path.basename(crfExecutable)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd
    rdd_crf = rdd_pipeinput.pipe(cmd)
    
    rdd_final = rdd_crf
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
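The comment in this example is worth spelling out: resolving shipped files through SparkFiles.get() only yields a usable absolute path when Spark runs with a local master; on a cluster the code falls back to bare basenames and relies on the files already sitting in each executor's working directory. Below is a minimal, self-contained sketch of that pattern; the throwaway stand-in model file and the local master URL are assumptions for illustration, not part of the dig-crf project.

import os
import tempfile
from pyspark import SparkContext, SparkFiles

sc = SparkContext("local[*]", "sparkfiles-resolve-sketch")

# Create a throwaway stand-in for the CRF model and ship it to the workers.
model_path = os.path.join(tempfile.mkdtemp(), "dig-hair-eye-train.model")
open(model_path, "w").close()
sc.addFile(model_path)

def resolve(path, location):
    # Mirrors the branch above: absolute path via SparkFiles.get() for a
    # local master, bare basename when the file is expected to already be
    # in each executor's working directory on a cluster.
    name = os.path.basename(path)
    return SparkFiles.get(name) if location == "local" else name

print(resolve(model_path, "local"))
print(resolve(model_path, "hdfs"))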
Example #2
def train_partition(idx, iterator):
    port = 50000 + idx % 256
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset-%02d.hdf5" % idx)

    flag_file = "flags/__BARISTA_READY__.%d" % port
    if os.path.isfile(flag_file):
        os.remove(flag_file)

    #  out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver,
                      "--dset-size", "30000",
                      "--initial-replay", "20000",
                      "--debug",
                      "--overwrite",
                      "--port", str(port)])

    while not os.path.isfile(flag_file):
        pass

    for step in iterator:
        dc = DummyClient("127.0.0.1", port)
        dc.send(barista.GRAD_UPDATE)
        response = dc.recv()
        yield response
Example #3
    def predict(self, X):
        """ Assumes X is an RDD or a list of (data, label) minibatch tuples."""

        if isinstance(X, RDD):
            # Distribute files
            X.context.addFile(self._solver_filename)
            X.context.addFile(self._architecture_filename)
            X.mapPartitions(self.predict)

        solver_filename = \
            SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
        architecture_filename = \
            SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

        # Might need to modify path to architecture file inside solver file.
        # Maybe we should do this before shipping the file since all Spark
        # tmp directories will be identically named.

        net = SGDSolver(solver_filename).net

        for minibatch_data, minibatch_label in X:
            # TODO: update function call for latest Caffe
            net.set_input_arrays(minibatch_data,
                                 minibatch_label,
                                 self.input_index)
            output = net.forward(end=self.score_blob)
            scores = output[self.score_blob]
            pred = np.argmax(scores, axis=1).squeeze()
            yield pred
Example #4
    def ship_prototxt_to_data(self, rdd):
        rdd.context.addFile(self._solver_filename)
        rdd.context.addFile(self._architecture_filename)
        solver_filename = \
            SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
        architecture_filename = \
            SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

        return solver_filename, architecture_filename
Example #5
		def compute_buried_area(pdb_complex):
			chZ = "chZ"
			
			sasa_complex = -1.0
			sasa_rec = -1.0
			sasa_lig = -1.0
			buried_total = -1.0

			base_name = get_name_model_pdb(pdb_complex)		
			ligand_name = get_ligand_from_receptor_ligand_model(base_name)
			f_pdb_ligand_no_docking = os.path.join(pdb_ligand_path.value,ligand_name+".pdb")		
			f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx")
			
			f_temp_sasa_complex = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_complex.xvg")
			f_temp_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_rec.xvg")			
			f_temp_sasa_lig = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig.xvg")						
						
			# Makes the index file with the ligand (chain z) and the rest (non chain z)
			script_make_ndx = SparkFiles.get("make_ndx_buried_area_total.sh") #Getting bash script that was copied by addFile command
			command = script_make_ndx + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx	
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()	
					
			command = gromacs_path.value +"gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface System " + " -output System "+ " -xvg none " + " -o " + f_temp_sasa_complex
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()

			# Makes f_temp_sasa_rec file 
			script_make_sasa_rec = SparkFiles.get("make_sasa_rec_buried_area_total.sh") #Getting bash script that was copied by addFile command
			command = script_make_sasa_rec + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " " + f_temp_sasa_rec
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()	

			command = gromacs_path.value +"gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface chZ " + " -output chZ "+ " -xvg none " + " -o " +  f_temp_sasa_lig
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()

			sasa_complex = get_value_from_xvg_sasa(f_temp_sasa_complex)
			sasa_rec = get_value_from_xvg_sasa(f_temp_sasa_rec)
			sasa_lig = get_value_from_xvg_sasa(f_temp_sasa_lig)

			buried_total = sasa_rec + sasa_lig - sasa_complex

			#Generating result - see column sorting because the resulting file will be created based on this sorting
			returned_list = (base_name, buried_total)

			#Deleting files
			os.remove(f_ndx)
			os.remove(f_temp_sasa_complex)
			os.remove(f_temp_sasa_rec)
			os.remove(f_temp_sasa_lig)			

			return returned_list			
Example #6
    def test_add_file_locally(self):
        path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt")
        self.sc.addFile(path)
        download_path = SparkFiles.get("hello.txt")
        self.assertNotEqual(path, download_path)
        with open(download_path) as test_file:
            self.assertEqual("Hello World!\n", test_file.readline())
Example #7
def load_timestep(timestep):
    path = data_path
    if download or config.copy_local:
        path = SparkFiles.get('pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc')
    data = Dataset(path)
    pr = data.variables['pr']
    step = pr[timestep]
    # Return valid values
    return (timestep, step[~step.mask])
Example #8
def spawn_barista(partition):
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset.hdf5")
    flag_file = "flags/__BARISTA_READY__"
    if os.path.isfile(flag_file):
        os.remove("flags/__BARISTA_READY__")

    out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver],
                     stdout=out,
                     stderr=subprocess.STDOUT)

    while not os.path.isfile("flags/__BARISTA_READY__"):
        pass
def partitionIp2city(iter):
    from geoip2 import database

    def ip2city(ip):
        try:
           city = reader.city(ip).city.name
        except:
            city = 'not found'
        return city

    reader = database.Reader(SparkFiles.get(geoDBpath))
    #return [ip2city(ip) for ip in iter]
    return ip2city(iter)
def main(sc):
  sqlContext = SQLContext(sc)
  df = sqlContext.jsonFile(DATA_PATH)
  #add the filter file
  sc.addFile(FILTER_TERMS_FILE_PATH)
  filter_terms = sc.textFile(SparkFiles.get("freebase-symptoms-just-terms.txt"))
  global filter_terms_set_bc
  filter_terms_set_bc = sc.broadcast(Set(filter_terms.collect()))
  # Register the DataFrame as a table.
  df.registerTempTable("tweet")
  results = sqlContext.sql("SELECT id,user.id,user.lang,created_at, coordinates,text FROM tweet where user.lang='en'")
  #filter tweets to find health related tweets
  filter_health_tweets = results.rdd.filter(healthFilter)
  filter_health_tweets.mapPartitions(writeRecords).saveAsTextFile("output/")
		def compute_buried_area_ligand(pdb_complex):
			chZ = "chZ"
			buried_lig_rec_perc = -1.0
			buried_lig_rec = -1.0
			buried_lig_lig = -1.0
			buried_lig_lig_perc = -1.0
			base_name = get_name_model_pdb(pdb_complex)		
			ligand_name = get_ligand_from_receptor_ligand_model(base_name)
			receptor_name = get_receptor_from_receptor_ligand_model(base_name)
			pose = get_model_from_receptor_ligand_model(base_name)						
			pdb_before_vs = os.path.join(pdb_ligand_path.value,ligand_name+".pdb")			
			#ndx files					
			f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx")			
			#xvg files
			xvg_temp_sasa_lig_pose = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_pose"+".xvg")
			xvg_temp_sasa_lig_complex = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_complex"+".xvg")
			xvg_temp_sasa_lig_min = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_min"+".xvg")
			# Creates a selection with the residues that are closer than 6A to the ligand
			script_make_ndx_buried_area_ligand = SparkFiles.get("make_ndx_buried_area_ligand.sh") #Getting bash script that was copied by addFile command
			command = script_make_ndx_buried_area_ligand + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " "+  xvg_temp_sasa_lig_pose + " "+ str(probe.value)  + " "+ str(ndots.value)  + " "+  xvg_temp_sasa_lig_complex  + " "+ pdb_before_vs  + " "+  xvg_temp_sasa_lig_min
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()			
			try:
				# SASA of the isolated ligand in the pose conformation			
				sasa_lig_pose = get_value_from_xvg_sasa(xvg_temp_sasa_lig_pose)
				# SASA of the complexed ligand in the pose conformation
				sasa_lig_complex = get_value_from_xvg_sasa(xvg_temp_sasa_lig_complex)
				# SASA of the isolated ligand in its energy-minimized conformation. Only for carbohydrates!
				sasa_lig_min = get_value_from_xvg_sasa(xvg_temp_sasa_lig_min)
				# Area of the ligand which is buried in the receptor
				buried_lig_rec = sasa_lig_pose - sasa_lig_complex
				buried_lig_rec_perc = buried_lig_rec / sasa_lig_pose
				# Area of the ligand in the pose conformation which is buried in itself when compared to the energy-minimized conformation
				buried_lig_lig = sasa_lig_min - sasa_lig_pose
				buried_lig_lig_perc = buried_lig_lig / sasa_lig_min
				returned_list = (base_name, buried_lig_rec, buried_lig_rec_perc, buried_lig_lig, buried_lig_lig_perc)

				#Deleting files
				os.remove(f_ndx)			
				os.remove(xvg_temp_sasa_lig_pose)
				os.remove(xvg_temp_sasa_lig_complex)
				os.remove(xvg_temp_sasa_lig_min)

				return returned_list
			except:
				return (base_name, float(0.0), float(0.0), float(0.0), float(0.0))
        def partition_processor(partitionlinechunks):
            """
            Partition logic for pyspark parallel processing
            """

            model_pipe_object = joblib.load(SparkFiles.get("mmp_phase1_D2.clf"))

            def set_predictions(x):
                segment = model_pipe_object.predict_proba(x)
                return segment

            df_with_nan = build_dataframe(partitionlinechunks)
            df_with_newline = df_with_nan.replace(u"NULL", pd.np.nan)
            behaviour_df = df_with_newline.replace(u"\\N", pd.np.nan)
            predictions_ser = set_predictions(behaviour_df)

            predictions_list = [value for value in [zip(predictions_ser.index, predictions_ser.loc[:,'A'], predictions_ser.loc[:,'Y'], predictions_ser.loc[:,'segment'], predictions_ser.loc[:,'model_version'])]]
            return iter(predictions_list)
def load_matrix(
        filename,
        sc,
        num_users=NUM_USER,
        num_items=NUM_SONG
):
    global alpha
    global total
    global num_zeros

    print 'Start to load matrix...'

    t0 = time.time()
    counts = np.zeros((num_users, num_items))
    total = 0.0
    num_zeros = num_users * num_items

    url = "s3n://spark-mllib/fastcode/data/" + filename
    # url = "hdfs://localhost:9000/data/" + filename
    print 'loading... ' + url
    # data = sc.textFile(url)
    # data.map(lambda l: fill_maxtrix(l, counts))

    sc.addFile(url)
    with open(SparkFiles.get(filename)) as f:
        for line in f:
            fill_maxtrix(line, counts)

    alpha = num_zeros / total
    print 'alpha %.2f' % alpha
    counts *= alpha

    t1 = time.time()
    print 'Finished loading matrix in %f seconds\n' % (t1 - t0)
    print 'Total entry:', num_users * num_items
    print 'Non-zeros:', num_users * num_items - num_zeros

    counts = sparse.csr_matrix(counts)

    return counts, num_users * num_items - num_zeros
Example #14
    return paras


if __name__ == "__main__":
    """
        Usage: test
    """
    # set the jsonString array of samples
    arr = ccm([20, 40], [1, 2], [1, 2], 250)
    # for local
    # scriptPath = "/Users/alexpb/Desktop/Lab/PySparkCUDAC/build/testPySpark"
    # for google cloud platform
    scriptPath = '/data/testPySpark'

    # os.system("chmod u+x %s" % scriptPath)
    if os.path.isfile(scriptPath):
        spark = SparkSession.builder.appName("PySparkCCM").getOrCreate()
        sc = spark.sparkContext
        sc.addFile(scriptPath, True)
        dataRDD = sc.parallelize(arr)
        pipeRDD = dataRDD.pipe(SparkFiles.get(scriptPath))
        for x in pipeRDD.collect():
            try:
                jsonstr = json.loads(x)
                result = jsonstr['result']
                print(sum(result) / float(len(result)))
                # print(jsonstr['result'])
            except ValueError, e:
                print("not valid result received")
        spark.stop()
Example #15
		def compute_buried_area(pdb_complex):
			chZ = "chZ"

			sasa_complex = -1.0
			sasa_rec = -1.0
			sasa_lig = -1.0
			buried_total = -1.0

			returned_list = []

			try:
				base_name = get_name_model_pdb(pdb_complex)
				ligand_name = get_ligand_from_receptor_ligand_model(base_name)
				f_pdb_ligand_no_docking = os.path.join(pdb_ligand_path.value,ligand_name+".pdb")
				f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx")

				f_temp_sasa_complex = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_complex.xvg")
				f_temp_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_rec.xvg")
				f_temp_sasa_lig = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig.xvg")

				# Makes the index file with the ligand (chain z) and the rest (non chain z)
				script_make_ndx = SparkFiles.get("make_ndx_buried_area_total.sh") #Getting bash script that was copied by addFile command
				command = script_make_ndx + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx
				process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
				stdout, stderr = process.communicate()

				command = gromacs_path.value +"gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface System " + " -output System "+ " -xvg none " + " -o " + f_temp_sasa_complex
				process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
				stdout, stderr = process.communicate()

				# Makes f_temp_sasa_rec file
				script_make_sasa_rec = SparkFiles.get("make_sasa_rec_buried_area_total.sh") #Getting bash script that was copied by addFile command
				command = script_make_sasa_rec + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " " + f_temp_sasa_rec
				process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
				stdout, stderr = process.communicate()

				command = gromacs_path.value +"gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface chZ " + " -output chZ "+ " -xvg none " + " -o " +  f_temp_sasa_lig
				process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
				stdout, stderr = process.communicate()

				sasa_complex = get_value_from_xvg_sasa(f_temp_sasa_complex)
				sasa_rec = get_value_from_xvg_sasa(f_temp_sasa_rec)
				sasa_lig = get_value_from_xvg_sasa(f_temp_sasa_lig)

				buried_total = sasa_rec + sasa_lig - sasa_complex

				#Generating result - see column sorting because the resulting file will be created based on this sorting
				returned_list = (base_name, buried_total)

			except:
				returned_list = (base_name, float(0))

			#Deleting files
			if os.path.exists(f_ndx):
				os.remove(f_ndx)
			if os.path.exists(f_temp_sasa_complex):
				os.remove(f_temp_sasa_complex)
			if os.path.exists(f_temp_sasa_rec):
				os.remove(f_temp_sasa_rec)
			if os.path.exists(f_temp_sasa_lig):
				os.remove(f_temp_sasa_lig)

			return returned_list
Example #16
BLACKLISTED_IP_FILENAME = 'greensnow.txt'

THREAT_DB_PATH = os.path.join(BASE_PATH, '../threatdb')

DUMPS_PATH = os.path.join(BASE_PATH, '../dumps')

# SPARK CONFIG
#SPARK = SparkSession.builder.master(SPARK_MASTER_URL).appName(SPARK_APP_NAME).config('spark.driver.memory','10g').config('spark.executer.memory','10g').config('spark.memory.fraction','0.6').config('spark.executor.JavaOptions', '-XX:+UseG1GC').config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer').getOrCreate()


SPARK = SparkSession.builder.master(
    SPARK_MASTER_URL
).appName(
    SPARK_APP_NAME
).getOrCreate()



COUNTRY_DB_FILEPATH = os.path.join(BASE_PATH, IP_DB_FILENAME)
SPARK.sparkContext.addFile(COUNTRY_DB_FILEPATH)
COUNTRY_DB_FILEPATH = SparkFiles.get(IP_DB_FILENAME)
# INIT DIRECTORIES
create_directory(DUMPS_PATH)
create_directory(TRAFFIC_LOGS_INPUT_DIR)
# create_directory(TRAFFIC_LOGS_OUTPUT_DIR)
create_directory(TENANT_PROFILE_OUTPUT_DIR)
create_directory(TENANT_MODEL_OUTPUT_DIR)
create_directory(ANOMALY_LOGS_OUTPUT_DIR)
create_directory(LOG_PATH)
create_directory(GRANULARIZED_LOG_PATH)
Example #17
def crfprocess(sc, input, output, 
               # specify uriClass=None to mean default it from inputType
               # specify uriClass=False to mean there is no uriClass filtering
               # specify uriClass=class name (e.g., 'Offer') to fully specify it
               uriClass=None,
               featureListFilename=configPath('features.hair-eye'),
               modelFilename=configPath('dig-hair-eye-train.model'),
               jaccardSpecs=[],
               imageTrainingOutput=False,
               # minimum initial number of partitions
               numPartitions=None, 
               # number of documents to send to CRF in one call
               chunksPerPartition=100,
               # coalesce/down partition to this number after CRF
               coalescePartitions=None,
               # inputType
               inputType=DIG_WEBPAGE,
               outputFormat="text",
               limit=None, 
               sampleSeed=1234,
               debug=0, 
               location='hdfs'):

    show = True if debug>=1 else False
    def showPartitioning(rdd):
        """Seems to be significantly more expensive on cluster than locally"""
        if show:
            partitionCount = rdd.getNumPartitions()
            try:
                valueCount = rdd.countApprox(1000, confidence=0.50)
            except:
                valueCount = -1
            print "At %s, there are %d partitions with on average %s values" % (rdd.name(), partitionCount, int(valueCount/float(partitionCount)))
            if valueCount == 0:
                showSizeAndExit(rdd)

    debugOutput = output + '_debug'
    def debugDump(rdd,keys=True,listElements=False):
        showPartitioning(rdd)
        keys=False
        if debug >= 2:
            startTime = time.time()
            outdir = os.path.join(debugOutput, rdd.name() or "anonymous-%d" % randint(10000,99999))
            keyCount = None
            try:
                keyCount = rdd.keys().count() if keys else None
            except:
                pass
            rowCount = None
            try:
                rowCount = rdd.count()
            except:
                pass
            elementCount = None
            try:
                elementCount = rdd.mapValues(lambda x: len(x) if isinstance(x, (list, tuple)) else 0).values().sum() if listElements else None
            except:
                pass
            rdd.saveAsTextFile(outdir)
            endTime = time.time()
            elapsedTime = endTime - startTime
            print "wrote [%s] to outdir %r: [%s, %s, %s]" % (str(timedelta(seconds=elapsedTime)), outdir, keyCount, rowCount, elementCount)

    def showSizeAndExit(rdd):
        try:
            k = rdd.count()
        except:
            k = None
        print "Just finished %s with size %s" % (rdd.name(), k)
        exit(0)

    crfFeatureListFilename = featureListFilename
    crfModelFilename = modelFilename
    crfExecutable = binPath("crf_test_filter.sh")
    crfExecutable = binPath("crf_test_filter_lines.sh")
    crfExecutable = "apply_crf_lines.py"
    sc.addFile(crfExecutable)
    sc.addFile(crfModelFilename)

    # LOADING DATA
    if numPartitions:
        rdd_ingest = sc.sequenceFile(input, minSplits=numPartitions)
    else:
        rdd_ingest = sc.sequenceFile(input)
    rdd_ingest.setName('rdd_ingest_input')
    showPartitioning(rdd_ingest)

    # LIMIT/SAMPLE (OPTIONAL)
    if limit==0:
        limit = None
    if limit:
        # Because take/takeSample collects back to master, can create "task too large" condition
        # rdd_ingest = sc.parallelize(rdd_ingest.take(limit))
        # Instead, generate approximately 'limit' rows
        ratio = float(limit) / rdd_ingest.count()
        rdd_ingest = rdd_ingest.sample(False, ratio, seed=sampleSeed)
        
    # FILTER TO KNOWN INTERESTING URLS (DEBUG, OPTIONAL)
    # For debugging, allow inclusion/exclusion of items with known behavior
    # If set, only those URIs so listed are used, everything else is rejected
    keepUris = []
    # contains both hair and eyes
    # keepUris.append('http://dig.isi.edu/ht/data/page/2384EBCB1DD4FCA505DD05AB15F386547D05B295/1429603739000/processed')
    # contains both hair and eyes
    # keepUris.append('http://dig.isi.edu/ht/data/page/18507EEC7DD0A94A3A00F46D8B976CDFDD258723/1429603859000/processed')
    # contains both hair and eyes
    # keepUris.append('http://dig.isi.edu/ht/data/page/442EA3A8B9FF69D65BC8B0D205C8C85204A7C799/1433150174000/processed')
    # for testing 'curly hair'
    # keepUris.append('http://dig.isi.edu/ht/data/page/681A3E68456987B1EE11616280DC1DBBA5A6B754/1429606198000/processed')
    if keepUris:
        rdd_ingest = rdd_ingest.filter(lambda (k,v): k in keepUris)
    # layout: pageUri -> content serialized JSON string
    rdd_ingest.setName('rdd_ingest_net')
    debugDump(rdd_ingest)

    # layout: pageUri -> dict (from json)
    rdd_json = rdd_ingest.mapValues(lambda x: json.loads(x))
    rdd_json.setName('rdd_json')
    debugDump(rdd_json)

    # RETAIN ONLY THOSE MATCHING URI CLASS
    if uriClass==None:
        if inputType==DIG_WEBPAGE:
            uriClass='WebPage'
        elif inputType==DIG_OFFER:
            uriClass='Offer'
        else:
            raise RuntimeError('Unknown inputType')
    if uriClass:
        # print("Filtering on uriClass={}".format(uriClass))
        rdd_relevant = rdd_json.filter(lambda (k,j): j.get("a", None)==uriClass)
    else:
        rdd_relevant = rdd_json
    rdd_relevant.setName('rdd_relevant')
    debugDump(rdd_relevant)

    def getIdentifiers(v):
        if inputType == DIG_OFFER:
            # We are looking for a toplevel attribute 'identifier'
            identifiers = v.get("identifier", None)
        elif inputType == DIG_WEBPAGE:
            # We are looking for the name field of the top level 'identifier' whose
            # "hasType" is "http://dig.isi.edu/thesaurus/identifier/ad-id"
            identifiers = []
            for idObj in v.get("identifier", []):
                if idObj.get("hasType") == "http://dig.isi.edu/thesaurus/identifier/ad-id":
                    name = idObj.get("name", None)
                    if name:
                        identifiers.extend(name)
        if identifiers:
            return asList(identifiers)
        else:
            return None

    def byIdentifier(k,v):
        result = []
        identifiers = getIdentifiers(v) or ["missing"]
        for i in identifiers:
            result.append( (k + "-" + str(i), v) )
        return result

    # Add identifier to URI
    if imageTrainingOutput:
        rdd_altered = rdd_relevant.flatMap(lambda (uri, j): byIdentifier(uri, j))
    else:
        rdd_altered = rdd_relevant
    rdd_altered.setName('rdd_altered')
    debugDump(rdd_altered)

    # print "### Processing %d input pages, initially into %s partitions" % (rdd_partitioned.count(), rdd_partitioned.getNumPartitions())
    # layout: pageUri -> (body tokens, title tokens)
    rdd_texts = rdd_altered.mapValues(lambda x: (textTokens(extract_body(x, inputType=inputType)), 
                                                 textTokens(extract_title(x, inputType=inputType))))
    rdd_texts.setName('rdd_texts')
    debugDump(rdd_texts)

    # We use the following encoding for values for CRF++'s so-called
    # labels to reduce the data size and complexity of referring to
    # words.  Each word is assigned a URI constructed from the page
    # URI (Karma URI) plus a 5 digit zero-padded number for the
    # subdocument plus a 5 digit zero-padded number for the word
    # index (1-based).  By "subdocument" we mean the page body and
    # page title (for HT; there could be others in other domains).
    # Additionally, an artificial "separator" document is used to
    # generate a barrier to avoid inadvertent capture of spurious
    # spans between subdocuments.
    #
    # Example: the first word of the body of
    # http://dig.isi.edu/ht/data/page/0434CB3BDFE3839D6CAC6DBE0EBED0278D3634D8/1433149274000/processed
    # would be http://dig.isi.edu/ht/data/page/0434CB3BDFE3839D6CAC6DBE0EBED0278D3634D8/1433149274000/processed/00000/00001

    SEPARATOR = ' '
    BODY_SUBDOCUMENT = 0
    SEPARATOR_SUBDOCUMENT = 1
    TITLE_SUBDOCUMENT = 2
    c = crf_features.CrfFeatures(crfFeatureListFilename)

    def makeMatrix(c, uri, bodyTokens, titleTokens):
        b = c.computeFeatMatrix(bodyTokens, False, addLabels=False, addIndex=False)
        s = c.computeFeatMatrix([SEPARATOR, ""], False, addLabels=False, addIndex=False)
        t = c.computeFeatMatrix(titleTokens, False, addLabels=False, addIndex=False)
        # BODY
        idx = 1
        for row in b:
            if row == u"":
                pass
            else:
                label = uri + "/%05d/%05d" % (BODY_SUBDOCUMENT, idx)
                row.append(label)
                idx += 1
        # SEPARATOR pseudo document
        idx = 1
        for row in s:
            if row == u"":
                pass
            else:
                label = uri + "/%05d/%05d" % (SEPARATOR_SUBDOCUMENT, idx)
                row.append(label)
                idx += 1
        # TITLE
        idx = 1
        for row in t:
            if row == u"":
                pass
            else:
                label = uri + "/%05d/%05d" % (TITLE_SUBDOCUMENT, idx)
                row.append(label)
                idx += 1
        # Keep the empty string semaphore from the title (last
        # component) for CRF++ purposes
        return b[0:-1] + s[0:-1] + t

    # page feature matrix including body, separator, title
    # (vector of vectors, includes separator rows)
    # layout: pageUri -> (python) vector of vectors
    rdd_features = rdd_texts.map(lambda (k,v): (k, makeMatrix(c, k, v[0], v[1])))
    rdd_features.setName('rdd_features')
    debugDump(rdd_features)

    # unicode UTF-8 representation of the feature matrix
    # layout: pageUri -> unicode UTF-8 representation of the feature matrix
    rdd_vector = rdd_features.mapValues(lambda x: vectorToUTF8(x))
    rdd_vector.setName('rdd_vector')
    debugDump(rdd_vector)

    # Disregard keys/partitioning considerations here
    # Drop keys, put serialized vectors into lists of size chunksPerPartition, dropping any nulls, then concatenate

    # layout: lists of size up to chunksPerPartition of UTF8(feature vectors)
    rdd_chunked = rdd_vector.values().glom().map(lambda l: [filter(lambda e: e, x) for x in iterChunks(l, chunksPerPartition)]).map(lambda l: ["".join(x) for x in l])
    rdd_chunked.setName('rdd_chunked')
    debugDump(rdd_chunked, keys=False)

    rdd_pipeinput = rdd_chunked.flatMap(lambda x: x).map(lambda r: b64encode(r))
    rdd_pipeinput.setName('rdd_pipeinput')
    debugDump(rdd_pipeinput, keys=False)

    # base64 encoded result of running crf_test and filtering to
    # include only word, wordUri, non-null label
    # local
    executable = SparkFiles.get(os.path.basename(crfExecutable)) if location=="local" else os.path.basename(crfExecutable)
    # local
    model = SparkFiles.get(os.path.basename(crfModelFilename)) if location=="local" else os.path.basename(crfModelFilename)
    cmd = "%s %s" % (executable, model)
    print "### Pipe cmd is %r" % cmd

    rdd_pipeoutput = rdd_pipeinput.pipe(cmd)
    if coalescePartitions:
        rdd_pipeoutput = rdd_pipeoutput.coalesce(max(2, coalescePartitions))
    rdd_pipeoutput.setName('rdd_pipeoutput')
    debugDump(rdd_pipeoutput)

    # base64 decoded to regular serialized string
    # beware newlines corresponding to empty CRF++ crf_test output
    # There may be a need to utf8-decode this data upon reacquisition, but that is believed unnecessary
    rdd_base64decode = rdd_pipeoutput.map(lambda x: b64decode(x))
    rdd_base64decode.setName('rdd_base64decode')
    debugDump(rdd_base64decode)

    def reorg(tabsep):
        (word, uri, label) = tabsep.split('\t')
        return (uri, (word, label))

    # 1. break into physical lines
    # 2. drop any inter-document empty string markers
    # 3. destructure each line into its own word, wordUri, label row
    # wordUri -> (word,label)
    rdd_tabular = rdd_base64decode.map(lambda b: b.split('\n')).flatMap(lambda x: x).filter(lambda x: x).map(lambda l: reorg(l))
    rdd_tabular.setName('rdd_tabular')
    debugDump(rdd_tabular)

    def organizeByOrigDoc(uri, word, label):
        (parentUri, docId, wordId) = uri.rsplit('/', 2)
        return ( (parentUri, docId), (wordId, word, label) )

    # composite key (docUri, subdocId) -> (wordId, word, label)
    rdd_reorg = rdd_tabular.map(lambda (uri,tpl): organizeByOrigDoc(uri, tpl[0], tpl[1]))
    rdd_reorg.setName('rdd_reorg')
    debugDump(rdd_reorg)

    def seqFunc(s,c):
        s.add(c)
        return s

    def combFunc(s1, s2):
        s1.update(s2)
        return s1

    # composite key (docUri, subdocId) -> set of (wordId, word, label)
    rdd_agg = rdd_reorg.aggregateByKey(set(),
                                       lambda s,c: seqFunc(s,c),
                                       lambda s1,s2: combFunc(s1,s2))
    rdd_agg.setName('rdd_agg')
    debugDump(rdd_agg)

    # (docUri, subDocId) -> sorted list of (wordId, word, label)
    rdd_grouped = rdd_agg.mapValues(lambda s: sorted(s))
    rdd_grouped.setName('rdd_grouped')
    debugDump(rdd_grouped)

    def harvest(seq):
        allSpans = []
        lastIndex = -2
        lastLabel = None
        currentSpan = []
        for (wordId, word, label) in seq:
            currentIndex = int(wordId)
            if lastIndex+1 == currentIndex and lastLabel == label:
                # continuing current span
                currentSpan.append( (currentIndex, word, label) )
            else:
                # end current span
                if currentSpan:
                    allSpans.append(currentSpan)
                # begin new span
                currentSpan = [ (currentIndex, word, label) ]
                lastLabel = label
            lastIndex = currentIndex

        # end current span
        if currentSpan:
            allSpans.append(currentSpan)
        
        result = []
        for span in allSpans:
            words = []
            spanLabel = None
            for (wordIdx, word, label) in span:
                spanLabel = label
                words.append(word)
            result.append( (' '.join(words), spanLabel) )
        return result
            
    # ( (parentUri, docId), [ (words1, category1), (words2, category2), ... ] )
    rdd_harvest = rdd_grouped.mapValues(lambda s: harvest(s))
    rdd_harvest.setName('rdd_harvest')
    debugDump(rdd_harvest)

    # parentUri -> (words, category)
    # we use .distinct() because (e.g.) both title and body might mention the same feature
    rdd_flat = rdd_harvest.map(lambda r: (r[0][0], r[1])).flatMapValues(lambda x: x).distinct()
    rdd_flat.setName('rdd_flat')
    debugDump(rdd_flat)

    # We map from CRF output (category) to (potentially multiple) HJ handle(s)
    hjHandlers = defaultdict(list)
    for (category,digFeature,config,reference) in jaccardSpecs:
        # add one handler
        if config and reference:
            hjHandlers[category].append({"category": category,
                                         "featureName": digFeature,
                                         "hybridJaccardInterpreter": HybridJaccard(config_path=config, ref_path=reference).findBestMatch})
        else:
            hjHandlers[category].append({"category": category,
                                         "featureName": digFeature,
                                         "hybridJaccardInterpreter": None})

    def jaccard(tpl):
        results = []
        (words, category) = tpl
        for handler in hjHandlers[category]:
            interpreter = handler["hybridJaccardInterpreter"]
            results.append({"featureName": handler["featureName"],
                            "featureValue": interpreter(words) if interpreter else words,
                            # for debugging
                            "crfCategory": category,
                            "crfWordSpan": words,
                            # intended to support parametrization/provenance
                            "featureDefinitionFile": os.path.basename(crfFeatureListFilename),
                            "featureCrfModelFile": os.path.basename(crfModelFilename)})
        return results

    # there could be more than one interpretation, e.g. hairColor + hairType for a given CRF category
    # use flatMapValues to iterate over all
    rdd_jaccard = rdd_flat.flatMapValues(lambda v: jaccard(v))
    rdd_jaccard.setName('rdd_jaccard')
    debugDump(rdd_jaccard)

    def extendDict(d, key, value):
        d[key] = value
        return d

    # add in the URI for karma modeling purposes
    rdd_aligned = rdd_jaccard.map(lambda (uri,v): (uri, extendDict(v, "uri", uri)))
    rdd_aligned.setName('rdd_aligned')
    debugDump(rdd_aligned)

    # rdd_aligned = rdd_pipeinput
    # docUri -> json

    def recoverIdentifier(k):
        return k.rsplit('-',1)[-1]

    if imageTrainingOutput:
        rdd_final = rdd_aligned.map(lambda (k,v): (recoverIdentifier(k), (v.get("featureName"),
                                                                          v.get("featureValue")))).filter(lambda (k,p): p[1]!='NONE')
    else:
        rdd_final = rdd_aligned.mapValues(lambda v: json.dumps(v))
    rdd_final.setName('rdd_final')
    debugDump(rdd_final)

    if rdd_final.isEmpty():
        print "### NO DATA TO WRITE"
    else:
        if outputFormat == "sequence":
            rdd_final.saveAsSequenceFile(output)
        elif outputFormat == "text":
            rdd_final.saveAsTextFile(output)
        elif outputFormat == "tsv":
            rdd_tsv = rdd_final.map(lambda (k,p): k + "\t" + p[0] + "\t" + p[1])
            rdd_tsv.saveAsTextFile(output)
        else:
            raise RuntimeError("Unrecognized output format: %s" % outputFormat)
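The long comment inside crfprocess explains the label scheme: each word is tagged with a URI built from the page URI plus zero-padded subdocument and word indices, and organizeByOrigDoc later undoes this with rsplit('/', 2). A tiny self-contained illustration of that round trip, using the example URI given in the comment:

# Build the label for the first word of the page body, as described in the comment.
page_uri = "http://dig.isi.edu/ht/data/page/0434CB3BDFE3839D6CAC6DBE0EBED0278D3634D8/1433149274000/processed"
BODY_SUBDOCUMENT = 0
word_uri = page_uri + "/%05d/%05d" % (BODY_SUBDOCUMENT, 1)

# organizeByOrigDoc reverses the construction with rsplit('/', 2).
parent_uri, doc_id, word_id = word_uri.rsplit('/', 2)
assert parent_uri == page_uri
assert (doc_id, word_id) == ("00000", "00001")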
Example #18
    def upload_file_job(context):
        from pyspark import SparkFiles
        with open(SparkFiles.get(upload_file_name)) as testFile:
            file_val = testFile.readline()
        return file_val
Example #19
    def sparkFilePathMapper(self, path):
        """When Spark forwards files from the driver to worker nodes, it may be
        necessary to map the filename path on a per-worker node basis."""
        # Note the implication in this code that the feature list file and
        # model file must have unique basenames.
        return SparkFiles.get(os.path.basename(path))
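A short usage sketch of the mapper above, outside its class (the shipped file is a throwaway placeholder named after the project's feature list; treat the setup as an assumption): the driver ships a file with addFile(), and worker-side code recovers the local copy by basename, which is why the docstring notes that basenames must be unique.

import os
import tempfile
from pyspark import SparkContext, SparkFiles

sc = SparkContext("local[*]", "path-mapper-sketch")

# Placeholder for a real feature list file shipped from the driver.
feature_list = os.path.join(tempfile.mkdtemp(), "features.hair-eye")
open(feature_list, "w").close()
sc.addFile(feature_list)

def sparkFilePathMapper(path):
    # Same logic as the method above, without the enclosing class.
    return SparkFiles.get(os.path.basename(path))

# Any driver-side path with the same basename maps to the worker-local copy.
print(sparkFilePathMapper("/some/driver/side/dir/features.hair-eye"))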
Example #20
    def update_data(self):
        self.arrays['data'][:] = np.random.randn(*self.arrays['data'].shape)
        self.arrays['label'][:] = np.random.choice(
                                      xrange(10),
                                      size=self.arrays['label'].shape)

    def process_model(self):
        pass

# Create some dummy data
dataRDD = sc.parallelize(xrange(100))

# Create some barista instances
num_baristas = 2
start_script = 'python -m barista.start'
solver = SparkFiles.get("solver.prototxt")
interfaces = sc.parallelize([solver]*num_baristas, num_baristas) \
                  .pipe(start_script) \
                  .collect()


# Join the data
def train(interface, data):
    solver_filename, pid = interface.split(',')
    customer = MyCustomer(solver_filename)
    customer.run_transaction()
    grad_norm = np.linalg.norm(customer.arrays['conv1_dW'])
    return grad_norm

grad_norms = dataRDD.map(lambda x: train(interfaces[0], x)).collect()
print grad_norms
Example #21
def find_neighbors(i):
    from annoy import AnnoyIndex
    ai = AnnoyIndex(f)
    ai.load(SparkFiles.get("index.ann"))
    return (ai.get_nns_by_vector(vector=x[1], n=5) for x in i)
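For context, a hedged sketch of the driver side that a partition function like this assumes: the Annoy index is built once, saved, shipped with addFile, and then loaded from the worker-local copy inside mapPartitions. The dimensionality f, the angular metric, and the random data are assumptions for illustration; only the "index.ann" name comes from the snippet.

import random
from annoy import AnnoyIndex
from pyspark import SparkContext, SparkFiles

f = 10  # assumed vector dimensionality
index = AnnoyIndex(f, 'angular')
for i in range(1000):
    index.add_item(i, [random.random() for _ in range(f)])
index.build(10)
index.save("index.ann")

sc = SparkContext("local[*]", "annoy-neighbors-sketch")
sc.addFile("index.ann")

def find_neighbors(partition):
    # Each partition loads its own worker-local copy of the shipped index.
    ai = AnnoyIndex(f, 'angular')
    ai.load(SparkFiles.get("index.ann"))
    return [ai.get_nns_by_vector(x[1], 5) for x in partition]

queries = sc.parallelize([(j, [random.random() for _ in range(f)]) for j in range(20)])
print(queries.mapPartitions(find_neighbors).take(3))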
Example #22
    rdd_json = rdd.mapValues(lambda x: json.loads(x))

    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))

    # TBD
    # rdd_title = rdd_json.mapValues(lambda x: extract_title(x))
    # rdd_title_tokens = rdd.title.mapValues(lambda x: textTokens(x))
    # all below should also be done for title

    # not a pair RDD?
    rdd_features = rdd_body_tokens.map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
    rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToString(x))

    cmd = SparkFiles.get("crf_test") + " -m " + SparkFiles.get(crfModelFilename)
    rdd_crf = rdd_pipeinput.values().pipe(cmd)
    # not a pair RDD
    # but we have the URI in the -3 position
    # and the index in the -2 position
    rdd_withuri = rdd_crf.map(lambda x: reconstructTuple(x))

    rdd_grouped = rdd_withuri.groupByKey()
    rdd_flat = rdd_grouped.mapValues(lambda x: [l[1:] for l in sorted(x, key=lambda r: int(r[0]))])
    rdd_harvested = rdd_flat.mapValues(lambda x: computeSpans(x, indexed=True))

    # This has the effect of generating 0, 1, 2, ... lines according to the number of spans
    rdd_controlled = rdd_harvested.flatMapValues(lambda x: list(x))
    # map any eyeColor spans using smEye, hairType spans using smHair
    rdd_aligned = rdd_controlled.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEye, "hairType": smHair}))
Example #23
import dash_html_components as html
from dash.dependencies import Input, Output
external_stylesheets = ["https://codepen.io/chriddyp/pen/bWLwgP.css"]

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
#server=app.server
plt_io.templates["custom_dark"] = plt_io.templates["plotly_dark"]

spark = SparkSession.builder.master('local[*]').appName('ICULux').config("spark.files.overwrite", "true")\
    .config("spark.worker.cleanup.enabled","true").getOrCreate()
sc = spark.sparkContext
url = "https://physionet.org/files/mimicdb/1.0.0/055/05500001.txt"
sc.addFile(url)

# first get all lines from file
with open(SparkFiles.get("05500001.txt"), 'r') as f:
    lines = f.readlines()

# remove spaces
lines = [line.replace(' ', '') for line in lines]

# finally, write lines in the file
#with open("temp.txt", 'w') as f:
#    f.writelines(lines)

schema = StructType([
    StructField("Name", StringType(), True),
    StructField("val1", StringType(), True),
    StructField("val2", StringType(), True),
    StructField("val3", StringType(), True)
])
sqlContext = SQLContext(sc)

#add the file to the spark context

#from pyspark.sql import SQLContext
url = "201029COVID19MEXICO.csv"
from pyspark import SparkFiles
sc.addFile(url)
sqlContext = SQLContext(sc)


# In[14]:


#read the csv and create a dataframe
df = sqlContext.read.csv(SparkFiles.get("201029COVID19MEXICO.csv"), header=True, inferSchema= True)


df.printSchema()

#group and print the count of type of patient
# 1 = outpatient, 2 = inpatient
df.groupBy("TIPO_PACIENTE").count().sort("count",ascending=True).show()


# In[15]:


import pyspark.sql.functions as F
#df1 = df.withColumnRenamed("TIPO_PACIENTE","Patient_Type")\.withColumnRenamed("SEXO","Sex")
Example #25
    def __init__(self):
        """Create a spark context and session"""
        print('__init__ called')
        config = configparser.ConfigParser()
        print(
            '---------------------------------------------------Start-----------------------------------------------------------------------------------------------------'
        )
        sc = SparkContext.getOrCreate()
        print(sc.applicationId)
        print(sc.master)

        print('spark conf starts')
        spark = SparkSession(sc)
        for item in spark.sparkContext.getConf().getAll():
            print(item)
        print('spark conf ends')

        #        cwd = os.getcwd()
        #        arr = os.listdir(cwd)
        #        print('-------------------------------------------------------working directories-----------------------------------------------------------------------------------')
        #        print(cwd)
        #        print(arr)
        #        print('----------------------------------------------------------cmd starts-----------------------------------------------------------------------------------------')
        ##        cmd = 'hdfs dfs -ls /home/hadoop'.split() # cmd must be an array of arguments
        ##        files = subprocess.check_output(cmd).strip().split('\n')
        ##        for path in files:
        ##          print (path)
        #        print('-------------------------------applicationId-------------------------------------------------------------------------------------------------------------------')
        #        appId = sc.applicationId
        #        print(appId)
        #        ip = 'ip-172-31-18-164.ec2.internal'
        #        appPath = 'hdfs://' + ip + ':8020/user/hadoop/.sparkStaging/' + appId + '/config.json'
        #        iniPath = 'hdfs://' + ip + ':8020/user/hadoop/.sparkStaging/' + appId + '/config.ini'
        #        print(appPath)
        #        print(iniPath)
        #        tmp1 = 'file:///user/hadoop/.sparkStaging/' + appId + '/config.ini'
        #        print(tmp1)
        ##        print('-------------------------------------------------------------config reader starts-------------------------------------------------------------------------')
        ##        strng = open(appPath, 'r').read()
        ##        print(strng)
        #        print('----------------------------------open ends---------------------------------------------------------------------------------')
        #        print(SparkFiles.getRootDirectory())
        #        print(os.listdir(SparkFiles.getRootDirectory()))
        ##        cmd = 'hdfs dfs -ls ' + SparkFiles.getRootDirectory() +''.split()
        ##        files = subprocess.check_output('hdfs dfs -ls ' + SparkFiles.getRootDirectory()).strip().split('\n')
        ##        for path in files:
        ##          print (path)
        ##        stg_path = str(fs.defaultFS) + "/user/" + str(os.environ['USER']) + "/.sparkStaging/" + str(sc.applicationId) + "/" lines = sc.textFile(os.path.join(stg_path,'readme.txt'))
        ##        print(lines.collect())
        #        print('--------------------------------------------------getRootDirectory----------------------------------------------------------------------')
        #
        #
        ##        configFile = pkg_resources.resource_filename(pkg_resources.Requirement.parse("myapp"), "config.ini")
        ##        config = ConfigParser.ConfigParser()
        ##        config.read(configFile)

        #

        #        print('------------------INI file--------------------------------------------------------------------------------------------------------------------------')
        #        conString = ''
        #        inputFile = 'config.ini'
        #        with open(SparkFiles.get(inputFile)) as test_file:
        #            conString = test_file.read()
        #        print('----------------------------------------------------------------config reader ends---------------------------------------------------------------------')
        #        print(conString)
        #        print('print(conString) starts')
        #        config.read_string(conString.decode())
        #        print('print(conString) ends')
        #        print(config)
        #        val = config.get('SPARK', 'val')
        ##        val = config['SPARK']['val']
        #        print(val)
        #        print('----------------------------------------------------------val-----------------------------------------------------------------------------------------------')

        print(
            '------------------JSON file--------------------------------------------------------------------------------------------------------------------------'
        )
        inputFile = 'config.json'
        #        print(os.environ)
        #        print(os.environ['SPARK_YARN_STAGING_DIR'])
        #        print('os.environ completed')
        print(SparkFiles.getRootDirectory())
        ipath = os.path.join(SparkFiles.getRootDirectory() + '/' + inputFile)
        print(ipath)
        conString = ''

        print(
            '-----------------------------------------------------------printing a------------------------------------------------------------'
        )
        a = sc.textFile("file:///" + SparkFiles.get(inputFile)).collect()
        print(a)
        print(
            '-----------------------------------------------------------printing b------------------------------------------------------------'
        )
        b = sc.textFile("file:///" + ipath).collect()
        print(b)
        #

        with open(SparkFiles.get(inputFile)) as test_file:
            conString = test_file.read()
        print(
            '----------------------------------------------------------------config reader ends---------------------------------------------------------------------'
        )
        print(conString)
        print(
            '----------------------------------------------------------val-----------------------------------------------------------------------------------------------'
        )

        #hdfs://ip-172-31-19-25.ec2.internal:8020
        #        print('---------------------------------------------Hadoop Files------------------------------------------------------------------------------------------------------')
        ##        print(SparkFiles.getRootDirectory())

        #        print('------------------------------------------------------------Context------------------------------------------------------------------------------------------------')
        ##        textFile = sc.textFile(appPath)
        #        conf = spark.read.option("multiline",True).json(appPath)
        #        print('------------------------------------------------------------------------Spark Read---------------------------------------------------------------------------')
        #        print(conf.printSchema())
        #        print(conf.select('SPARK1').first()[0])
        #        print(sc.sparkUser())
        #        print('---------------------------------------------------------completed---------------------------------------------------------------------------------------')
        #        data = sc.parallelize(list(conf['SPARK']))
        data = sc.parallelize(list('HelloWorld12345'))
        data.map(lambda x: (x, 1)).reduceByKey(add).sortBy(
            lambda x: x[1], ascending=False).coalesce(1).saveAsTextFile(
                'tmp/result/' +
                str(ts))  #s3://nithin-emr/' + str(ts) + '/result'
        sc.stop()
Example #26
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName(
    "FakeNewsPoject_Naive_Bayes").getOrCreate()

from pyspark import SparkFiles
# Load in Fake.csv from S3 into a DataFrame
fake_url = "https://bootcamp-proj-3.s3.us-east-2.amazonaws.com/Fake.csv"
spark.sparkContext.addFile(fake_url)

raw_fake_df = spark.read.csv(SparkFiles.get("Fake.csv"), sep=",", header=True)
# raw_fake_df.show(10)

# Load in True.csv from S3 into a DataFrame
true_url = "https://bootcamp-proj-3.s3.us-east-2.amazonaws.com/True.csv"
spark.sparkContext.addFile(true_url)

raw_true_df = spark.read.csv(SparkFiles.get("True.csv"), sep=",", header=True)
# raw_true_df.show(10)

import pyspark.sql.functions as sf

# Add true/fake categories
add_category_fake = raw_fake_df.withColumn('category', sf.lit('Fake'))
add_category_true = raw_true_df.withColumn('category', sf.lit('True'))
Example #27
File: App.py  Project: kostjaigin/bachelor
def apply_network(dataset: str, serialized):
    hyperparams_route = SparkFiles.get(f'{dataset}_hyper.pkl')
    model_route = SparkFiles.get(f'{dataset}_model.pth')
    predictor = Predictor(hyperparams_route, model_route)
    return predictor.predict(serialized)
Example #28
File: crfprep.py  Project: cjsanjay/dig-crf
def crfprep(sc, inputFilename, outputDirectory, 
            limit=LIMIT, location='hdfs', outputFormat="text", partitions=None):
    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    featureListFilename = os.path.join(crfConfigDir, "features.hair-eye")
    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    crfExecutable = "/usr/local/bin/crf_test_filter.sh"
    crfModelFilename = os.path.join(crfConfigDir, "dig-hair-eye-train.model")

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    
    if limit:
        rdd_sequence_file_input = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)

    rdd_json = rdd_sequence_file_input.mapValues(lambda x: json.loads(x))
    rdd_json.setName('rdd_json')
    # rdd_json.persist()

    rdd_texts = rdd_json.mapValues(lambda x: (textTokens(extract_body(x)), textTokens(extract_title(x))))
    rdd_texts.setName('rdd_texts')
    # data format issue?
    # rdd_texts.saveAsSequenceFile(outputDirectory + "_texts")

    # This separator could have appeared in original text, and should serve to cleanly delimit the body from the title
    # Not perfect, it could have appeared between real tokens

    # Needs to have single labels+index feature
    # former code was lost

    c = crf_features.CrfFeatures(featureListFilename)
    SEPARATOR = ' '

    def makeMatrix(c, uri, bodyTokens, titleTokens):
        b = c.computeFeatMatrix(bodyTokens, False, addLabels=False, addIndex=False)
        s = c.computeFeatMatrix([SEPARATOR, ""], False, addLabels=False, addIndex=False)
        t = c.computeFeatMatrix(titleTokens, False, addLabels=False, addIndex=False)
        idx = 1
        for row in b:
            if row == u"":
                pass
            else:
                label = uri + "/%05d/%05d" % (0, idx)
                row.append(label)
                idx += 1
        idx = 1
        for row in s:
            if row == u"":
                pass
            else:
                label = uri + "/%05d/%05d" % (1, idx)
                row.append(label)
                idx += 1
        idx = 1
        for row in t:
            if row == u"":
                pass
            else:
                label = uri + "/%05d/%05d" % (2, idx)
                row.append(label)
                idx += 1
        # might be b[0:-1] + s[0:-1] + t?
        return b[0:-1] + s[0:-1] + t


    rdd_features = rdd_texts.map(lambda x: (x[0], makeMatrix(c, x[0], x[1][0], x[1][1])))
    rdd_features.setName('rdd_features')
    # rdd_features.persist()

    rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToUTF8(x)).values()
    rdd_pipeinput.setName('rdd_pipeinput')

    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfExecutable), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfExecutable)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "###CMD %s" % cmd
    rdd_crfoutput = rdd_pipeinput.pipe(cmd)
    rdd_crfoutput.setName('rdd_crfoutput')
    # rdd_features.persist()
    
    rdd_final = rdd_crfoutput
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
from pyspark.sql import Row
from pyspark import SparkFiles
from pyspark.sql.types import *
from pyspark.ml import Pipeline
print('xxxxxxxxxxxxxxxxx -PIPELINE- xxxxxxxxxxxxxxx #2')

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, OneHotEncoderEstimator
from pyspark.sql.functions import col, countDistinct
import pyspark.sql.functions as f
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col
print('xxxxxxxxxxxxxxxxx -STRINGIndexer- xxxxxxxxxx #3')

url = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
sc.addFile(url)
Dataf = sqlContext.read.csv(SparkFiles.get("adult_data.csv"),
                            header=True,
                            inferSchema=True)

print(
    'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx HEADS xxxxxxxxxxxx '
)
windowX = Window.partitionBy(Dataf['income']).orderBy(Dataf['age'].desc())
Dataf.select(
    'income', 'gender', 'workclass', 'education', 'educational-num', 'income',
    rank().over(windowX).alias('pemba')).filter(col('pemba') <= 50000).show()
print(
    'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx GROUPLIST xxxxxxxx '
)
Dataf.printSchema()
print((Dataf.count(), len(Dataf.columns)))
		def compute_buried_area_all_residues_and_receptor_area(pdb_complex):
			chZ = "chZ"
			res_buried_area_perc = -1
			res_buried_area = -1
			buried_receptor_system = -1
			buried_receptor_res = -1
			base_name = get_name_model_pdb(pdb_complex)
			ligand_name = get_ligand_from_receptor_ligand_model(base_name)
			receptor_name = get_receptor_from_receptor_ligand_model(base_name)
			pose = get_model_from_receptor_ligand_model(base_name)

			#output area receptor file
			f_output_receptor_buried_area = os.path.join(path_analysis_pdb_complex_b.value,base_name+".outAreaRecep")
			#ndx files
			#f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx")
			f_ndx_temporary_index_z = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_index_z"+".ndx")
			f_ndx_temporary = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary"+".ndx")
			f_ndx_temporary_sasa = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa"+".ndx")

			#xvg files
			f_xvg_temporary_sasa_res_lig = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa_res-lig"+".xvg")
			f_xvg_temporary_sasa_res  = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa_res"+".xvg")
			f_xvg_temporary_sasa_rec_lig  = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa_rec_lig"+".xvg")
			f_xvg_temporary_sasa_rec  = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa_rec"+".xvg")

			# Creates a selection with the residues that are closer than 6A to the ligand
			script_make_ndx_buried_area_receptor = SparkFiles.get("make_ndx_buried_area_receptor.sh") #Getting bash script that was copied by addFile command
			command = script_make_ndx_buried_area_receptor + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx_temporary_index_z + " "+ f_ndx_temporary
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()
			# copying file
			if os.path.exists(f_ndx_temporary):
				shutil.copy(f_ndx_temporary, f_ndx_temporary_sasa)
				#Get all residues for computing area receptor
				all_res = get_residues_receptor_from_ndx_files(f_ndx_temporary)
				returned_list = []
				for res in all_res:
					script_make_ndx_buried_area_receptor_res = SparkFiles.get("make_ndx_buried_area_receptor_res.sh") #Getting bash script that was copied by addFile command
					command = script_make_ndx_buried_area_receptor_res + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx_temporary_sasa + " "+ str(res)
					process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
					stdout, stderr = process.communicate()
					# compute surface of system - saved on xvg
					command = gromacs_path.value +"gmx sasa -surface complex -output rec_"+str(res)+ " -o "+ f_xvg_temporary_sasa_res_lig + " -xvg none -f " + pdb_complex +" -s " + pdb_complex + " -n "+ f_ndx_temporary + " -nopbc "
					process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
					stdout, stderr = process.communicate()
					# compute surface of receptor - save on xvg
					command = gromacs_path.value +"gmx sasa -surface rec -output rec_"+str(res)+ " -o "+ f_xvg_temporary_sasa_res + " -xvg none -f " + pdb_complex +" -s " + pdb_complex + " -n "+ f_ndx_temporary + " -nopbc "
					process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
					stdout, stderr = process.communicate()
					#calculate area
					if os.path.exists(f_xvg_temporary_sasa_res_lig):
						buried_receptor_system = get_value_from_xvg_sasa(f_xvg_temporary_sasa_res_lig)
					else:
						buried_receptor_system = 0
					if os.path.exists(f_xvg_temporary_sasa_res):
						buried_receptor_res  = get_value_from_xvg_sasa(f_xvg_temporary_sasa_res)
					else:
						buried_receptor_res = 0
					res_buried_area = buried_receptor_res - buried_receptor_system
					if (res_buried_area > 0) and (buried_receptor_res > 0):
						res_buried_area_perc = res_buried_area/buried_receptor_res
						#Generating result
						result = (base_name, res, res_buried_area,  res_buried_area_perc)
						returned_list.append(result)
					#Deleting files
					if os.path.exists(f_xvg_temporary_sasa_res_lig):
						os.remove(f_xvg_temporary_sasa_res_lig)
					if os.path.exists(f_xvg_temporary_sasa_res):
						os.remove(f_xvg_temporary_sasa_res)

					#Computing Receptor Area
					command = gromacs_path.value +"gmx sasa -surface complex -output rec"+ " -o "+ f_xvg_temporary_sasa_rec_lig + " -xvg none -f " + pdb_complex +" -s " + pdb_complex + " -n "+ f_ndx_temporary + " -nopbc "
					process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
					stdout, stderr = process.communicate()

					command = gromacs_path.value +"gmx sasa -surface rec -output rec"+ " -o "+ f_xvg_temporary_sasa_rec + " -xvg none -f " + pdb_complex +" -s " + pdb_complex + " -n "+ f_ndx_temporary + " -nopbc "
					process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
					stdout, stderr = process.communicate()

					if os.path.exists(f_xvg_temporary_sasa_rec_lig):
						sasa_rec_lig = get_value_from_xvg_sasa(f_xvg_temporary_sasa_rec_lig)
					else:
						sasa_rec_lig = 0

					if os.path.exists(f_xvg_temporary_sasa_rec):
						sasa_rec = get_value_from_xvg_sasa(f_xvg_temporary_sasa_rec)
					else:
						sasa_rec = 0

					receptor_area = sasa_rec - sasa_rec_lig

					#Saving result file
					output_receptor_buried_area = open(f_output_receptor_buried_area, "w")
					output_receptor_buried_area.write(str(base_name)+" "+str(receptor_area) +"\n")
					output_receptor_buried_area.close()

					#Deleting all files
					if os.path.exists(f_xvg_temporary_sasa_rec_lig):
						os.remove(f_xvg_temporary_sasa_rec_lig)
					if os.path.exists(f_xvg_temporary_sasa_rec):
						os.remove(f_xvg_temporary_sasa_rec)
					if os.path.exists(f_ndx_temporary):
						os.remove(f_ndx_temporary)
					if os.path.exists(f_ndx_temporary_sasa):
						os.remove(f_ndx_temporary_sasa)
					if os.path.exists(f_ndx_temporary_index_z):
						os.remove(f_ndx_temporary_index_z)

					return returned_list
			else:
				# Reaching this point means the ndx file was not created, so the area could not be computed
				return (base_name, "NAN", float(0),  float(0))
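
# A hedged driver-side sketch (not from the original project) of how this function might be
# used. It assumes gromacs_path and path_analysis_pdb_complex_b are broadcast variables and
# that the two bash scripts were shipped with sc.addFile so SparkFiles.get can resolve them
# on the executors; the install path, output directory and input list are hypothetical.
sc.addFile("make_ndx_buried_area_receptor.sh")
sc.addFile("make_ndx_buried_area_receptor_res.sh")
gromacs_path = sc.broadcast("/usr/local/gromacs/bin/")            # hypothetical install path
path_analysis_pdb_complex_b = sc.broadcast("/data/analysis/")     # hypothetical output dir
pdb_complex_files = ["/data/complexes/receptor_ligand_model_0001.pdb"]  # hypothetical input
# Each result element is either a list of (base_name, res, area, area_perc) tuples
# or an error tuple when the ndx file could not be produced.
results = sc.parallelize(pdb_complex_files).map(compute_buried_area_all_residues_and_receptor_area).collect()
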
예제 #31
0
# Can I persist a Caffe network object?
import copy
from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles
from pyspark import StorageLevel

conf = SparkConf().setAppName("SparkCaffe Test")
conf.set("spark.executor.memory", "1g")
sc = SparkContext(conf=conf)

sc.addFile("models/solver.prototxt")
sc.addFile("models/train_val.prototxt")

solver = SparkFiles.get("solver.prototxt")
architecture = SparkFiles.get("train_val.prototxt")


def create_net(solver_filename):
    from caffe import SGDSolver

    net = SGDSolver(str(solver_filename)).net
    return net

netRDD = sc.parallelize([solver]*2, 2) \
             .map(create_net)

netRDD.persist(StorageLevel.MEMORY_ONLY)


def extract_unique_val(net):
    return net.params['conv1'][0].data[0, 0, 0, 0]
예제 #32
0
def main():
    conf = (SparkConf()
                .setMaster("local[*]")
                .setAppName("compare_engine"))
                
    sc = SparkContext(conf = conf)
    sc.setLogLevel('INFO')

    sc.addFile(primary)

    # rdd_primary = sc.textFile(primary, minPartitions=4, use_unicode=True).distinct() 
    rdd_primary = sc.textFile(SparkFiles.get(primary), minPartitions=4, use_unicode=True).distinct() 
    rdd_primary.partitionBy(10).cache()

    os.system('rm -Rf collects_*')
    os.system('rm -Rf holder.txt')
       
    rdd_secondary = sc.textFile(secondary, minPartitions=4, use_unicode=True).distinct()
    rdd_secondary.partitionBy(10).cache()

    primary_count = rdd_primary.count()
    primary_report['count'] = primary_count
    print(primary_report)

    secondary_count = rdd_secondary.count()
    secondary_report['count'] = secondary_count
    print(secondary_report)

    # Return each Primary file line/record not contained in Secondary
    not_in_primary  = rdd_primary.subtract(rdd_secondary)
    primary_diff = not_in_primary.count()
    primary_report['diff'] = primary_diff
    
    os.system('rm -Rf collects_*.csv')

    primary_dir = 'collects_{}_primary'.format(run_date)
    primary_report_name = 'collects_{}_primary_report.csv'.format(run_date)

    not_in_primary.coalesce(1, True).saveAsTextFile(primary_dir)

    # os.system('cat collects_{}_primary/part-0000* >> collects_{}_primary_report.csv'.format(run_date, run_date))
    os.system('cat {}/part-0000* >> {}'.format(primary_dir, primary_report_name))
    os.system('wc -l collects_{}_primary_report.csv'.format(run_date))

    # Flip Primary Vs Secondary
    # Return each Secondary file line/record not contained in Primary
    not_in_secondary  = rdd_secondary.subtract(rdd_primary)
    secondary_diff = not_in_secondary.count()
    secondary_report['diff'] = secondary_diff

    not_in_secondary.coalesce(1,True).saveAsTextFile('collects_{}_secondary'.format(run_date))
    os.system('cat collects_{}_secondary/part-0000* >> collects_{}_secondary_report.csv'.format(run_date, run_date))
    os.system('wc -l collects_{}_secondary_report.csv'.format(run_date))

    process_report['primary'] = primary_report
    process_report['secondary'] =  secondary_report

    print("=" * 100)
    print('\n')
    print(process_report)
    print('\n')
    print("=" * 100)
    spark_details(sc)
    

    sc.stop()
THREDSHOLD = 0.6

sc = SparkContext.getOrCreate()

sc.addFile(
    's3://s3-cdp-prod-airflow-dag/1.10/artifacts/brandnorm/cqi_brand/python/ner/crf.model.5Feat_33018Pos_11350Neg'
)
sc.addFile(
    's3://s3-cdp-prod-airflow-dag/1.10/artifacts/brandnorm/cqi_brand/python/ner/brand_dict.txt'
)
#sc.addPyFile('crfTaggerManager.py')

spark = SparkSession(sc)

with open(SparkFiles.get('brand_dict.txt'), 'r') as infile:
    brandDictMap = json.load(infile)

from crf_title_tagger import extract_features

# load logger
from log_factory import *
logger = get_prod_logger()


def get_raw_df():
    df = spark.read.orc('s3://s3-cdp-prod-hive/temp/cqi_item_brand_field_no_extraction/')\
        .select('item_id','original_category_codes','level_one_category_codes','level_two_category_codes','original_product_name','original_brand')\
        .toDF('itemId','originalCategory','levelOneCategory','levelTwoCategory','originalProductName','originalbrand')
    return df
예제 #34
0
def driver(sc, inputFilename, outputDirectory, 
           crfExecutable, crfScript, 
           featureListFilename, crfModelFilename, 
           eyeColorRef, eyeColorConfig, hairRef, hairConfig, 
           limit=limit, location='hdfs', outputFormat="text", partitions=None):
    dump = False
    partitions = None

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except:
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    
    origSize = rdd_sequence_file_input.count()
#     if limit:
#         rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (inputFilename, rdd_sequence_file_input.count(), origSize, limit, rdd_sequence_file_input.getNumPartitions())

    rdd_json = rdd_sequence_file_input.mapValues(lambda x: json.loads(x))
    rdd_json.setName('rdd_json')
    # rdd_json.persist()

    # all below should also be done for title
    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body.setName('rdd_body')
    # rdd_body.persist()
    if dump:
        rdd_body.saveAsTextFile(ff("body"))
        
    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))
    rdd_body_tokens.setName('rdd_body_tokens')
    # rdd_body_tokens.persist()
    if dump:
        rdd_body_tokens.saveAsTextFile(ff("body_tokens"))

    rdd_features = rdd_body_tokens.map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
    rdd_features.setName('rdd_features')
    # rdd_features.persist()
    if dump:
        rdd_features.saveAsTextFile(ff("features"))
    
    # rdd_pipeinput = rdd_features.mapValues(lambda x: base64.b64encode(vectorToString(x)))
    rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToString(x))
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()
    if dump:
        rdd_pipeinput.values().saveAsTextFile(ff("pi"))
    # This caused a cannot concatenate string + None error
    # rdd_pipeinput.saveAsTextFile(outputDirectory + "-pipeinput")

    # DON'T USE SparkFiles.get to fetch the crf_test or model
    # This only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd
    rdd_pipeinput.saveAsTextFile(ff("before"))
    exit(0)

    rdd_crf_b64 = rdd_pipeinput.values().pipe(cmd)
    rdd_crf_b64.setName('rdd_crf_b64')
    # rdd_crf_b64.persist()
    if dump:
        rdd_crf_b64.saveAsTextFile(ff("po"))

    # Go directly from base64 output to a reconstructed tuple format mapping URI to vector of vectors, 
    # with empty string suffix indicating blank line
    # This is key for avoiding the groupBy step
    rdd_restore = rdd_crf_b64.map(lambda x: restore(x))
    rdd_restore.setName('rdd_restore')
    # rdd_restore.persist()
    if dump:
        rdd_restore.saveAsTextFile(ff("restore"))

    # ### WE NO LONGER HAVE TO GROUPBY
    # ### BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW
    # rdd_withuri = sc.parallelize(rdd_withuri.take(10))

    rdd_harvested = rdd_restore.mapValues(lambda x: computeSpans(x, indexed=True)).filter(lambda p: p[1])
    rdd_harvested.setName('rdd_harvested')
    # rdd_harvested.persist()
    if dump:
        rdd_harvested.saveAsTextFile(ff("harvested"))

    # This has the effect of generating 0, 1, 2, ... lines according to the number of spans
    rdd_controlled = rdd_harvested.flatMapValues(lambda x: list(x))
    rdd_controlled.setName('rdd_controlled')
    # rdd_controlled.persist()

    # map any eyeColor spans using smEyeColor, hairType spans using smHairColor
    rdd_aligned = rdd_controlled.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor}))
    rdd_aligned.setName('rdd_aligned')
    # rdd_aligned.persist()
    if dump:
        rdd_aligned.saveAsTextFile(ff("aligned"))

    rdd_aligned_json = rdd_aligned.mapValues(lambda x: json.dumps(x))
    rdd_aligned_json.setName('rdd_aligned_json')
    # rdd_aligned_json.persist()
    if dump:
        rdd_aligned_json.saveAsTextFile(ff("aligned_json"))

    rdd_final = rdd_aligned_json
    empty = rdd_final.isEmpty()
    if not empty:
        l = "unknown>1"
        print "### writing %s output (%s records) to %s" % (outputFormat, l, outputDirectory)
        # print len(rdd_final.collect())
        if outputFormat == "sequence":
            rdd_final.saveAsSequenceFile(outputDirectory)
        elif outputFormat == "text":
            rdd_final.saveAsTextFile(outputDirectory)
        else:
            raise RuntimeError("Unrecognized output format: %s" % outputFormat)
    else:
        print "### No records: no output into %s" % (outputDirectory)
h2oContext = H2OContext(sc).start()
# Define file names
chicagoAllWeather = "chicagoAllWeather.csv"
chicagoCensus = "chicagoCensus.csv"
chicagoCrimes10k = "chicagoCrimes10k.csv"

# Add files to Spark Cluster
sc.addFile(_locate(chicagoAllWeather))
sc.addFile(_locate(chicagoCensus))
sc.addFile(_locate(chicagoCrimes10k))

# Since the files were already distributed with Spark, we use h2o.upload_file instead of h2o.import_file:
# h2o.import_file expects a cluster-relative path (i.e. a path reachable from every machine in the cluster),
# whereas SparkFiles.get(..) returns a node-local path, which h2o.upload_file can handle (it uploads the file
# from the current node and distributes it across the H2O cluster).
f_weather = h2o.upload_file(SparkFiles.get(chicagoAllWeather))
f_census = h2o.upload_file(SparkFiles.get(chicagoCensus))
f_crimes = h2o.upload_file(SparkFiles.get(chicagoCrimes10k))


# Transform weather table
# Remove 1st column (date)
f_weather = f_weather[1:]

# Transform census table
# Remove all spaces from column names (causing problems in Spark SQL)
col_names = map(lambda s: s.strip().replace(' ', '_').replace('+', '_'), f_census.col_names)

# Update column names in the table
# f_weather.names = col_names
f_census.names = col_names
# ## Dataframe Basics

# In[1]:

# Import our SparkSession so we can use it
from pyspark.sql import SparkSession
# Create our SparkSession, this can take a couple minutes locally
spark = SparkSession.builder.appName("basics").getOrCreate()

# In[ ]:

# Load in data
from pyspark import SparkFiles
url = "https://s3.amazonaws.com/dataviz-curriculum/day_1/food.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("food.csv"), sep=",", header=True)
df.show()

# In[4]:

# Let's show the data
df.show()

# In[5]:

# Print our schema
df.printSchema()

# In[6]:

# Show the columns
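# (the call itself is missing from the excerpt; presumably the cell is simply)
df.columns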
sc = SparkContext('local', 'testGeoSpark')

#X = sys.argv[1]

#normal
normalFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
normalPath = os.path.join(normalFilePath)
sc.addFile(normalPath);

#attack
attackFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
attackPath = os.path.join(attackFilePath)
sc.addFile(attackPath)

from pyspark import SparkFiles
normalRdd = sc.textFile(SparkFiles.get(normalFilePath))
attackRdd = sc.textFile(SparkFiles.get(attackFilePath))


import geoip2.database
geoDBpath = '/home/worker/workspace/geoDB/GeoLite2-City.mmdb'
geoPath = os.path.join(geoDBpath)
sc.addFile(geoPath)
#reader = geoip2.database.Reader(SparkFiles.get(geoPath))
#reader = geoip2.database.Reader('GeoLite2-City.mmdb')

# def ip2city(ip):
#     try:
#         city = reader.city(ip).city.name
#     except:
#         city = 'not found'
예제 #38
0
def send2monit(data):
    """
    Helper function which wraps StompAMQ and incoming dataframe into
    notification message. Then it sends it to AMQ end-point provided
    by credentials file.
    """
    import os
    import stomp
    import time
    import uuid
    import logging

    class StompyListener(object):
        """
        Auxiliary listener class to fetch all possible states in the Stomp
        connection.
        """
        def __init__(self):
            self.logr = logging.getLogger(__name__)

        def on_connecting(self, host_and_port):
            self.logr.info('on_connecting %s', str(host_and_port))

        def on_error(self, headers, message):
            self.logr.info('received an error %s %s', str(headers),
                           str(message))

        def on_message(self, headers, body):
            self.logr.info('on_message %s %s', str(headers), str(body))

        def on_heartbeat(self):
            self.logr.info('on_heartbeat')

        def on_send(self, frame):
            self.logr.info('on_send HEADERS: %s, BODY: %s ...',
                           str(frame.headers),
                           str(frame.body)[:160])

        def on_connected(self, headers, body):
            self.logr.info('on_connected %s %s', str(headers), str(body))

        def on_disconnected(self):
            self.logr.info('on_disconnected')

        def on_heartbeat_timeout(self):
            self.logr.info('on_heartbeat_timeout')

        def on_before_message(self, headers, body):
            self.logr.info('on_before_message %s %s', str(headers), str(body))

            return (headers, body)

    class StompAMQ(object):
        """
        Class to generate and send notifications to a given Stomp broker
        and a given topic.

        :param username: The username to connect to the broker.
        :param password: The password to connect to the broker.
        :param producer: The 'producer' field in the notification header
        :param topic: The topic to be used on the broker
        :param host_and_ports: The hosts and ports list of the brokers.
            E.g.: [('agileinf-mb.cern.ch', 61213)]
        """

        # Version number to be added in header
        _version = '0.1'

        def __init__(self,
                     username,
                     password,
                     producer='CMS_WMCore_StompAMQ',
                     topic='/topic/cms.jobmon.wmagent',
                     host_and_ports=None,
                     verbose=0):
            self._host_and_ports = host_and_ports or [
                ('agileinf-mb.cern.ch', 61213)
            ]
            self._username = username
            self._password = password
            self._producer = producer
            self._topic = topic
            self.verbose = verbose

        def send(self, data):
            """
            Connect to the stomp host and send a single notification
            (or a list of notifications).

            :param data: Either a single notification (as returned by
                `make_notification`) or a list of such.

            :return: a list of successfully sent notification bodies
            """

            conn = stomp.Connection(host_and_ports=self._host_and_ports)
            conn.set_listener('StompyListener', StompyListener())
            try:
                conn.start()
                conn.connect(username=self._username,
                             passcode=self._password,
                             wait=True)
            except stomp.exception.ConnectFailedException as exc:
                print("ERROR: Connection to %s failed %s" %
                      (repr(self._host_and_ports), str(exc)))
                return []

            # If only a single notification, put it in a list
            if isinstance(data, dict) and 'topic' in data:
                data = [data]

            successfully_sent = []
            for notification in data:
                body = self._send_single(conn, notification)
                if body:
                    successfully_sent.append(body)

            if conn.is_connected():
                conn.disconnect()

            print('Sent %d docs to %s' %
                  (len(successfully_sent), repr(self._host_and_ports)))
            return successfully_sent

        def _send_single(self, conn, notification):
            """
            Send a single notification to `conn`

            :param conn: An already connected stomp.Connection
            :param notification: A dictionary as returned by `make_notification`

            :return: The notification body in case of success, or else None
            """
            try:
                body = notification.pop('body')
                destination = notification.pop('topic')
                conn.send(destination=destination,
                          headers=notification,
                          body=json.dumps(body),
                          ack='auto')
                if self.verbose:
                    print('Notification %s sent' % str(notification))
                return body
            except Exception as exc:
                print('ERROR: Notification: %s not sent, error: %s' % \
                              (str(notification), str(exc)))
                return None

        def make_notification(self, payload, id_, producer=None):
            """
            Generate a notification with the specified data

            :param payload: Actual notification data.
            :param id_: Id representing the notification.
            :param producer: The notification producer.
                Default: StompAMQ._producer

            :return: the generated notification
            """
            producer = producer or self._producer

            notification = {}
            notification['topic'] = self._topic

            # Add headers
            headers = {
                'type': 'cms_wmagent_info',
                'version': self._version,
                'producer': producer
            }

            notification.update(headers)

            # Add body consisting of the payload and metadata
            body = {
                'payload': payload,
                'metadata': {
                    'timestamp': int(time.time()),
                    'id': id_,
                    'uuid': str(uuid.uuid1()),
                }
            }
            notification['body'] = body
            return notification

    # main function logic
    with open(SparkFiles.get('amq_broker.json')) as istream:
        creds = json.load(istream)
        host, port = creds['host_and_ports'].split(':')
        port = int(port)
        amq = StompAMQ(creds['username'], creds['password'], \
            creds['producer'], creds['topic'], [(host, port)])
        arr = []
        for idx, row in enumerate(data):
            #            if  not idx:
            #                print("### row", row, type(row))
            doc = json.loads(row)
            hid = doc.get("hash", 1)
            arr.append(amq.make_notification(doc, hid))
        amq.send(arr)
        print("### Send %s docs to CERN MONIT" % len(arr))

def mapper(line, title, secfile, idsec):
    post = mdb.posts
    tokens = word_tokenize(line)
    tagged = pos_tag(tokens)
    ntities = chunk.ne_chunk(tagged)
    newline = line.encode('utf-8')

    posting = {"securitynow_id": idsec, "episode": secfile[3:6], "speaker": title, "original": line, "tokens": tokens,
               "entities": ntities, "sentiment": classifier.classify(dict([(word, True) for word in newline]))}
    post_id = post.insert(posting)


sc.addFile("/home/th3m4d0n3/NetBeansProjects/twAppDemo/data_dir/allSentimentData")
with open(SparkFiles.get("allSentimentData")) as f:
    reader = csv.reader(f, delimiter=" ", quotechar='"')

    jobs = bg.BackgroundJobManager()
    map(parseForNltk, reader)

    print("chezdata type DATA: {0} COUNT: {1}".format(type(chezdata), len(chezdata)))

    map(getHighest, chezdata)

    chezdataP = sc.parallelize(chezdata)
    lowRatedP = sc.parallelize(lowRated)
    highlyRatedP = sc.parallelize(highlyRated)

    print("chezdataP type DATA: {0} COUNT: {1}".format(type(chezdataP), chezdataP.count()))
    print("lowRatedP type DATA: {0} COUNT: {1}".format(type(lowRatedP), lowRatedP.count()))
예제 #40
0
# Can I persist a Caffe network object?
import copy
from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles
from pyspark import StorageLevel

conf = SparkConf().setAppName("SparkCaffe Test")
conf.set("spark.executor.memory", "1g")
sc = SparkContext(conf=conf)

sc.addFile("models/solver.prototxt")
sc.addFile("models/train_val.prototxt")

solver = SparkFiles.get("solver.prototxt")
architecture = SparkFiles.get("train_val.prototxt")


def create_net(solver_filename):
    from caffe import SGDSolver

    net = SGDSolver(str(solver_filename)).net
    return net


netRDD = sc.parallelize([solver] * 2, 2).map(create_net)

netRDD.persist(StorageLevel.MEMORY_ONLY)


def extract_unique_val(net):
    return net.params["conv1"][0].data[0, 0, 0, 0]
# -*- coding: utf-8 -*-
import os
from pyspark import SparkContext, SparkFiles

# apache spark test job

# Write a Spark job to count the occurrences of each word in a text file. Document that it works with a small example.
CONNECTION_STR = "spark://" + os.environ[
    "SPARK_MASTER_PORT_7077_TCP_ADDR"] + ":" + os.environ[
        "SPARK_MASTER_ENV_SPARK_MASTER_PORT"]
sc = SparkContext(CONNECTION_STR, "test")

sc.addFile(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 "exercise1_data.txt"))

with open(SparkFiles.get("exercise1_data.txt")) as testFile:
    lines = sc.parallelize(testFile.readlines())

words = lines.flatMap(lambda s: s.split())  #Splitting the lines into words
pairs = words.map(
    lambda s: (s, 1)
)  #Creating a pair for each word in the form (word, count) where count is the occurrence of the word, set to 1
counts = pairs.reduceByKey(lambda a, b: a + b)  #Counting the words

res = sorted(counts.collect(), key=lambda tup: tup[1],
             reverse=True)  #Get the result and sort it based on count
for (word, count) in res:
    print("'{0}' has an occurrences of {1}".format(
        word, count))  #Print out the result
sqlContext = SQLContext(sc)
sqlContext.setConf('spark.sql.shuffle.partitions', '7')

from pyspark.sql import SparkSession
import pyspark.sql as sparksql
from pyspark import SparkFiles

spark = SparkSession.builder.appName('stroke').getOrCreate()

from google.colab import drive
drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/'


train = sqlContext.read.csv(SparkFiles.get("/content/"+root_path+"train_2v.csv"), header=True, inferSchema=True)



# explain about Infer Schema
train.groupBy('label').count().show()

train.printSchema()

train.dtypes


train.describe().show()

# create DataFrame as a temporary view for SQL queries
train.createOrReplaceTempView('table')
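
# With the temporary view in place, SQL can be run directly against it; a small hedged
# example using the 'label' column referenced above:
spark.sql("SELECT label, COUNT(*) AS cnt FROM `table` GROUP BY label").show()
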
예제 #43
0
def fun(iterable):
    with open(SparkFiles.get('num_data')) as f:
        value = int(f.readline())
        return [x * value for x in iterable]
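
# Hedged usage sketch: 'num_data' (a file whose first line holds a single integer) must be
# distributed with sc.addFile before the closure runs on the executors; the numbers are
# hypothetical.
sc.addFile("num_data")
scaled = sc.parallelize([1, 2, 3, 4], 2).mapPartitions(fun)
print(scaled.collect())
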
예제 #44
0
distScript = os.getcwd()+"/src/R/finddistance.R"
distScriptName = "finddistance.R"
sc.addFile(distScript)


def hasDistInfo(call):
    """Verify that a call has the fields required to compute the distance"""
    requiredFields = ["mylat", "mylong", "contactlat", "contactlong"]
    return all(map(lambda f: call[f], requiredFields))


def formatCall(call):
    """Format a call so that it can be parsed by our R program"""
    return "{0},{1},{2},{3}".format(
        call["mylat"], call["mylong"],
        call["contactlat"], call["contactlong"])

pipeInputs = contactsContactList.values().flatMap(
    lambda calls: map(formatCall, filter(hasDistInfo, calls)))
distances = pipeInputs.pipe(SparkFiles.get(distScriptName))
print distances.collect()
# Convert our RDD of strings to numeric data so we can compute stats and
# remove the outliers.
distanceNumerics = distances.map(lambda string: float(string))
stats = distanceNumerics.stats()
stddev = stats.stdev()
mean = stats.mean()
reasonableDistances = distanceNumerics.filter(
    lambda x: math.fabs(x - mean) < 3 * stddev)
print reasonableDistances.collect()
def create_low_test_skycomponents_from_gleam(flux_limit=0.1, polarisation_frame=PolarisationFrame("stokesI"),
                                             frequency=numpy.array([1e8]), kind='cubic', phasecentre=None,
                                             radius=1.0, nchan=16) \
        -> List[Skycomponent]:
    """Create sky components from the GLEAM survey

    Stokes I is estimated from a cubic spline fit to the measured fluxes. The polarised flux is always zero.
    
    See http://www.mwatelescope.org/science/gleam-survey The catalog is available from Vizier.
    
    VIII/100   GaLactic and Extragalactic All-sky MWA survey  (Hurley-Walker+, 2016)

    GaLactic and Extragalactic All-sky Murchison Wide Field Array (GLEAM) survey. I: A low-frequency extragalactic
    catalogue. Hurley-Walker N., et al., Mon. Not. R. Astron. Soc., 464, 1146-1167 (2017), 2017MNRAS.464.1146H


    :rtype: Union[None, List[arl.data.data_models.Skycomponent], List]
    :param flux_limit: Only write components brighter than this (Jy)
    :param polarisation_frame: Polarisation frame (default PolarisationFrame("stokesI"))
    :param frequency: Frequencies at which the flux will be estimated
    :param kind: Kind of interpolation (see scipy.interpolate.interp1d) Default: cubic
    :param phasecentre: Desired phase centre (SkyCoord) default None implies all sources
    :param radius: Radius of sources selected around phasecentre (default 1.0 rad)
    :return: List of Skycomponents
    """

    # fitsfile = arl_path("data/models/GLEAM_EGC.fits")
    #
    # rad2deg = 180.0 / numpy.pi
    # decmin = phasecentre.dec.to('deg').value - rad2deg * radius / 2.0
    # decmax = phasecentre.dec.to('deg').value + rad2deg * radius / 2.0
    #
    # hdulist = fits.open(fitsfile, lazy_load_hdus=False)
    # recs = hdulist[1].data[0].array
    #
    # # Do the simple forms of filtering in pyfits. Filtering on radious is done below.
    # fluxes = recs['peak_flux_wide']
    #
    # mask = fluxes > flux_limit
    # filtered_recs = recs[mask]
    #
    # decs = filtered_recs['DEJ2000']
    # mask = decs > decmin
    # filtered_recs = filtered_recs[mask]
    #
    # decs = filtered_recs['DEJ2000']
    # mask = decs < decmax
    # filtered_recs = filtered_recs[mask]
    #
    # ras = filtered_recs['RAJ2000']
    # decs = filtered_recs['DEJ2000']
    # names = filtered_recs['Name']
    #
    # if polarisation_frame is None:
    #     polarisation_frame = PolarisationFrame("stokesI")
    #
    # npol = polarisation_frame.npol
    #
    # nchan = len(frequency)
    #
    # # For every source, we read all measured fluxes and interpolate to the
    # # required frequencies
    # gleam_freqs = numpy.array([76, 84, 92, 99, 107, 115, 122, 130, 143, 151, 158, 166, 174, 181, 189, 197, 204,
    #                            212, 220, 227])
    # gleam_flux_freq = numpy.zeros([len(names), len(gleam_freqs)])
    # for i, f in enumerate(gleam_freqs):
    #     gleam_flux_freq[:, i] = filtered_recs['int_flux_%03d' % (f)][:]
    #
    # skycomps = []
    #
    # for isource, name in enumerate(names):
    #     direction = SkyCoord(ra=ras[isource] * u.deg, dec=decs[isource] * u.deg)
    #     if phasecentre is None or direction.separation(phasecentre).to('rad').value < radius:
    #
    #         fint = interpolate.interp1d(gleam_freqs * 1.0e6, gleam_flux_freq[isource, :], kind=kind)
    #         flux = numpy.zeros([nchan, npol])
    #         flux[:, 0] = fint(frequency)
    #         if not numpy.isnan(flux).any():
    #             skycomps.append(Skycomponent(direction=direction, flux=flux, frequency=frequency,
    #                                          name=name, shape='Point',
    #                                          polarisation_frame=polarisation_frame))
    #
    # log.info('create_low_test_skycomponents_from_gleam: %d sources above flux limit %.3f' % (len(skycomps), flux_limit))
    #
    # hdulist.close()

    with open(SparkFiles.get("sc" + str(nchan)), "rb") as f:
        skycomps = pickle.load(f)
        f.close()
    return skycomps
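
# Hedged driver-side sketch: the pickled component list is looked up via
# SparkFiles.get("sc" + str(nchan)), so a matching file (e.g. "sc16" for nchan=16) has to be
# added on the driver before this function runs on the workers; the path is hypothetical.
sc.addFile("/path/to/sc16")
components = create_low_test_skycomponents_from_gleam(flux_limit=0.1, nchan=16)
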
import spa_utils
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark import SparkFiles
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
import pyspark.sql.types as sql_type

app_name = os.path.basename(__file__)
spark = SparkSession.builder.appName(
    app_name).enableHiveSupport().getOrCreate()
spark.sparkContext.addPyFile('logging.conf')
spark.sparkContext.addPyFile('params.yaml')
spark.sparkContext.addPyFile('SPA_baseline_functions.py')

logging.config.fileConfig(SparkFiles.get('logging.conf'))  # load the logging configuration
logger = logging.getLogger(app_name)  # the logger name matches the job name

# Load the job parameters
params = spa_utils.load_params()
logger.info('parameter file loaded')
logger.debug(params)

# Command-line arguments take higher priority
if len(sys.argv) >= 2:
    params['update_origin'] = sys.argv[1]
    params['update_end'] = sys.argv[1]
if len(sys.argv) >= 3:
    params['update_end'] = sys.argv[2]
if len(sys.argv) >= 4:
    params['write_mode'] = sys.argv[3]
예제 #47
0
from pyspark import SparkContext
from pyspark import SparkFiles
finddistance = "/home/maria_dev/finddistance.R"
finddistancename = "finddistance.R"
sc = SparkContext("local", "SparkFile App")
sc.addFile(finddistance)
print "Absolute Path -> %s" % SparkFiles.get(finddistancename)
예제 #48
0
 def upload_file_job(context):
     from pyspark import SparkFiles
     with open(SparkFiles.get(upload_file_name)) as testFile:
         file_val = testFile.readline()
     return file_val
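
# Hedged usage sketch: upload_file_name is assumed to have been registered on the driver
# with sc.addFile(upload_file_name); the one-element RDD only forces the closure to run on
# an executor, and the element itself is ignored by upload_file_job.
sc.addFile(upload_file_name)
first_line = sc.parallelize([None]).map(upload_file_job).collect()[0]
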
예제 #49
0
import matplotlib.pyplot as plt
import numpy as np
from pandas.plotting import autocorrelation_plot
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp
from pyspark import SparkFiles
from datetime import timedelta, datetime
import time

spark = SparkSession.builder \
    .master("local[1]") \
    .appName("Vaccinations") \
    .getOrCreate()

# LOADING DATA
url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/us_state_vaccinations.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv("file://" + SparkFiles.get("us_state_vaccinations.csv"),
                    header=True,
                    inferSchema=True).select("date", "location",
                                             "daily_vaccinations")

# ONLY INCLUDES ROWS WHERE date, location, and daily_vaccinations are present
print("all rows", df.count())
df = df.na.drop()
print("only non null", df.count())

# CONVERT TO DATE TYPE
df = df.select('*', unix_timestamp(df.date.cast('date')).alias('time'))

# GET LIST OF ALL STATES
states = df.rdd.map(lambda x: x.location).distinct().collect()
예제 #50
0
# get(filename)
# Returns the path of a file that was added through SparkContext.addFile().

# getRootDirectory()
# Returns the root directory containing the files added through SparkContext.addFile().
from pyspark import SparkContext
from pyspark import SparkFiles
file = "/Users/zhangyong/pyspark_learning/README.md"
filename = "README.md"
sc = SparkContext("local", "SparkFile App")
sc.addFile(file)
print "Absolute Path -> %s" % SparkFiles.get(filename)
# Import library
import pyspark
from pyspark.sql import SQLContext
from pyspark import SparkFiles

# Setting pyspark to variable to leverage its functionality
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)

# Reading data file in data frame in the form of RDD (Resilient Distributed Dataset)
df = sqlContext.read.csv(SparkFiles.get("D:/Masters/KDM/data.csv"),
                         header=True,
                         inferSchema=True)

# RDD Actions
# Collect all the information present in the data set
action_1 = df.collect()
print("Action 1: Collecting all the information present within the data set")
print(action_1)
# Count number of elements in the data set
action_2 = df.count()
print("Action 2: Count the number of data points present within the data set")
print(action_2)
# Return first 'n' number of elements from the data set
action_3 = df.take(2)
print(
    "Action 3: Taking out 'n' number of data points from the entire data set")
print(action_3)

# RDD Transformations with Actions
# Grouping data and counting their total based on the 'Contract' feature
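# The grouping itself is not in the excerpt; a short sketch of what it would look like
# (assumes the data set has a 'Contract' column, as the comment above implies):
transformation_1 = df.groupBy("Contract").count()
print("Transformation 1: Grouping data points by 'Contract' and counting them")
transformation_1.show()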
def _getCountryByIP(ip):
    citydb = geoIP.Reader(SparkFiles.get('GeoLite2-City.mmdb'))
    return (citydb.city(ip).country.name or u'Unknown').encode()
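
# Hedged usage sketch (not from the original file): assumes "import geoip2.database as geoIP"
# and that the GeoLite2 database was shipped with sc.addFile so SparkFiles.get resolves it on
# the executors; the path and IPs are hypothetical.
sc.addFile("/path/to/GeoLite2-City.mmdb")
countries = sc.parallelize(["8.8.8.8", "1.1.1.1"]).map(_getCountryByIP).collect()
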
예제 #53
0
 def _resolvepath(self, p):
     if self._conf['use_sparkfiles']:
         from pyspark import SparkFiles
         return SparkFiles.get(p)
     else:
         return p
예제 #54
0
#X = sys.argv[1]

#normal
normalFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
normalPath = os.path.join(normalFilePath)
sc.addFile(normalPath);

#attack
attackFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
attackPath = os.path.join(attackFilePath)
sc.addFile(attackPath);


from pyspark import SparkFiles
normalRdd = sc.textFile(SparkFiles.get(normalFilePath)).cache()
attackRdd = sc.textFile(SparkFiles.get(attackFilePath)).cache()

# src, dst, data_length, protocol_name, protocol_number, arrival_time (len = 6)
normalRaw = normalRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache()
attackRaw = attackRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache()

#(ip, count)
normalTopXSrcIP = normalRaw.map(lambda x:(x[0], 1)).groupByKey().map(lambda (k,v):(k, sum(v))).takeOrdered(10, key = lambda (k,v): -v)
attackTopXSrcIP = attackRaw.map(lambda x:(x[0], 1)).groupByKey().map(lambda (k,v):(k, sum(v))).takeOrdered(10, key = lambda (k,v): -v)

#(ip, count)
normalTopXDstIP = normalRaw.map(lambda x:(x[1], 1)).groupByKey().map(lambda (k,v):(k, sum(v))).takeOrdered(10, key = lambda (k,v): -v)
attackTopXDstIP = attackRaw.map(lambda x:(x[1], 1)).groupByKey().map(lambda (k,v):(k, sum(v))).takeOrdered(10, key = lambda (k,v): -v)

#(ip, data_length)
예제 #55
0
 def find_neighbors(iter):
     t = AnnoyIndex(rank)
     t.load(SparkFiles.get("index.ann"))
     # search_k
     return ((x[0] - 1, t.get_nns_by_item(x[0] - 1, int(number_recs)))
             for x in iter)
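
# Hedged usage sketch: the Annoy index must be shipped with sc.addFile so
# SparkFiles.get("index.ann") works in every partition; rank, number_recs and the item RDD
# come from the enclosing scope in the original, so hypothetical values stand in here.
sc.addFile("/path/to/index.ann")
rank, number_recs = 10, 5
item_rdd = sc.parallelize([(1,), (2,), (3,)])       # hypothetical (item_id, ...) rows
neighbors = item_rdd.mapPartitions(find_neighbors).collect()
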
예제 #56
0
파일: driver2.py 프로젝트: cjsanjay/dig-crf
def driver(sc, inputFilename, outputDirectory, 
           crfExecutable, crfScript, 
           featureListFilename, crfModelFilename, 
           eyeColorRef, eyeColorConfig, hairRef, hairConfig, 
           limit=limit, location='hdfs', outputFormat="text", partitions=None):
    dump = False
    partitions = 8

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except:
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    
    origSize = rdd_sequence_file_input.count()
#     if limit:
#         rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (inputFilename, rdd_sequence_file_input.count(), origSize, limit, rdd_sequence_file_input.getNumPartitions())

    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    # ### WE NO LONGER HAVE TO GROUPBY
    # ### BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW
    # rdd_withuri = sc.parallelize(rdd_withuri.take(10))

    rdd_final = (rdd_sequence_file_input
                 .mapValues(lambda x: json.loads(x))
                 .mapValues(lambda x: extract_body(x))
                 .mapValues(lambda x: textTokens(x))
                 .map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
                 .mapValues(lambda x: base64.b64encode(vectorToString(x)))
                 .values()
                 .pipe(cmd)
                 .map(lambda x: restore(x))
                 .mapValues(lambda x: computeSpans(x, indexed=True))
                 .filter(lambda p: p[1])
                 .flatMapValues(lambda x: list(x))
                 .mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor}))
                 .mapValues(lambda x: json.dumps(x)))

    empty = rdd_final.isEmpty()
    if not empty:
        l = "unknown>1"
        print "### writing %s output (%s records) to %s" % (outputFormat, l, outputDirectory)
        print len(rdd_final.collect())
#         if outputFormat == "sequence":
#             rdd_final.saveAsSequenceFile(outputDirectory)
#         elif outputFormat == "text":
#             rdd_final.saveAsTextFile(outputDirectory)
#         else:
#             raise RuntimeError("Unrecognized output format: %s" % outputFormat)
    else:
        print "### No records: no output into %s" % (outputDirectory)
예제 #57
0
distScript = os.getcwd() + "/src/R/finddistance.R"
distScriptName = "finddistance.R"
sc.addFile(distScript)


def hasDistInfo(call):
    """Verify that a call has the fields required to compute the distance"""
    requiredFields = ["mylat", "mylong", "contactlat", "contactlong"]
    return all(map(lambda f: call[f], requiredFields))


def formatCall(call):
    """Format a call so that it can be parsed by our R program"""
    return "{0},{1},{2},{3}".format(call["mylat"], call["mylong"],
                                    call["contactlat"], call["contactlong"])


pipeInputs = contactsContactList.values().flatMap(
    lambda calls: map(formatCall, filter(hasDistInfo, calls)))
distances = pipeInputs.pipe(SparkFiles.get(distScriptName))
print distances.collect()
# Convert our RDD of strings to numeric data so we can compute stats and
# remove the outliers.
distanceNumerics = distances.map(lambda string: float(string))
stats = distanceNumerics.stats()
stddev = stats.stdev()
mean = stats.mean()
reasonableDistances = distanceNumerics.filter(
    lambda x: math.fabs(x - mean) < 3 * stddev)
print reasonableDistances.collect()
def create_named_configuration(name: str = 'LOWBD2',
                               **kwargs) -> Configuration:
    """ Standard configurations e.g. LOWBD2, MIDBD2

    :param name: name of Configuration LOWBD2, LOWBD1, LOFAR, VLAA
    :param rmax: Maximum distance of station from the average (m)
    :return:
    
    For LOWBD2, setting rmax gives the following number of stations
    100.0       13
    300.0       94
    1000.0      251
    3000.0      314
    10000.0     398
    30000.0     476
    100000.0    512
    """

    if name == 'LOWBD2':
        location = EarthLocation(lon="116.4999", lat="-26.7000", height=300.0)
        fc = create_configuration_from_file(antfile=arl_path("LOWBD2.csv"),
                                            location=location,
                                            mount='xy',
                                            names='LOWBD2_%d',
                                            diameter=35.0,
                                            **kwargs)
    elif name == 'LOWBD1':
        location = EarthLocation(lon="116.4999", lat="-26.7000", height=300.0)
        fc = create_configuration_from_file(
            antfile=SparkFiles.get("LOWBD1.csv"),
            location=location,
            mount='xy',
            names='LOWBD1_%d',
            diameter=35.0,
            **kwargs)
    elif name == 'LOWBD2-CORE':
        location = EarthLocation(lon="116.4999", lat="-26.7000", height=300.0)
        fc = create_configuration_from_file(
            antfile=arl_path("LOWBD2-CORE.csv"),
            location=location,
            mount='xy',
            names='LOWBD2_%d',
            diameter=35.0,
            **kwargs)
    elif name == 'LOFAR':
        assert get_parameter(kwargs, "meta", False) is False
        fc = create_LOFAR_configuration(
            antfile=arl_path("data/configurations/LOFAR.csv"))
    elif name == 'VLAA':
        location = EarthLocation(lon="-107.6184", lat="34.0784", height=2124.0)
        fc = create_configuration_from_file(
            antfile=arl_path("data/configurations/VLA_A_hor_xyz.csv"),
            location=location,
            mount='altaz',
            names='VLA_%d',
            diameter=25.0,
            **kwargs)
    elif name == 'VLAA_north':
        location = EarthLocation(lon="-107.6184", lat="90.000", height=2124.0)
        fc = create_configuration_from_file(
            antfile=arl_path("data/configurations/VLA_A_hor_xyz.csv"),
            location=location,
            mount='altaz',
            names='VLA_%d',
            diameter=25.0,
            **kwargs)
    else:
        raise ValueError("No such Configuration %s" % name)
    return fc
예제 #59
0
spark.sparkContext.addFile(dataPath + 'data-payment_lookup-csv.csv')
spark.sparkContext.addFile(dataPath + 'data-vendor_lookup-csv.csv')
spark.sparkContext.addFile(
    dataPath + 'data-sample_data-nyctaxi-trips-2009-json_corrigido.json')
spark.sparkContext.addFile(
    dataPath + 'data-sample_data-nyctaxi-trips-2010-json_corrigido.json')
spark.sparkContext.addFile(
    dataPath + 'data-sample_data-nyctaxi-trips-2011-json_corrigido.json')
spark.sparkContext.addFile(
    dataPath + 'data-sample_data-nyctaxi-trips-2012-json_corrigido.json')

# ### Reading and correcting the Payment source

# In[7]:

df_payment = spark.read.csv(SparkFiles.get("data-payment_lookup-csv.csv"),
                            header=True,
                            sep=",")
df_payment.show(3)

# It turns out the first row needs to be ignored. An index was included to help with the correction.
#
# Pandas is used to read the CSV, skipping the row at index 0.

# In[8]:

temp = pd.read_csv(SparkFiles.get("data-payment_lookup-csv.csv"),
                   skiprows=[0],
                   sep=',',
                   header=None)
temp.head()