from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def main():
    appName = "langPopCount;zl"
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            .set("spark.executor.instances", "3"))  # note: "instances", not "instance"
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)
    langTagList = ['<java>', '<javascript>', '<c>', '<c++>', '<c#>', '<python>',
                   '<php>', '<css>', '<html>', '<objective-c>']
    resultrdd = sc.emptyRDD()
    for tag in langTagList:
        postCountdf = hc.sql("select creationdate, 1 as c from questionpost "
                             "where tags like '%{tag}%'".format(tag=tag))
        # Column.substr is 1-based, so substr(1, 4) extracts the year.
        # DataFrame.map works directly on Spark 1.x; on 2.x use .rdd.map.
        # tag is bound as a default argument so the closure does not depend on
        # when Spark serializes it.
        postCountOnYearrdd = postCountdf \
            .filter(postCountdf.creationdate != '__none__') \
            .withColumn('year', postCountdf.creationdate.substr(1, 4)) \
            .drop('creationdate') \
            .groupBy('year').count() \
            .withColumnRenamed('count', 'c') \
            .repartition(1) \
            .sort('year', ascending=True) \
            .map(lambda row, tag=tag: "{tag} {year} {cnt}".format(
                tag=tag.strip('<>'), year=row.year, cnt=row.c))
        resultrdd = resultrdd.union(postCountOnYearrdd)
    resultrdd = resultrdd.repartition(1)
    resultrdd.saveAsTextFile('/sshomework_zl/popCount')
    sc.stop()
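# A minimal, self-contained sketch of the accumulation pattern this snippet
# (and most of the ones below) relies on: seed with sc.emptyRDD() and union in
# each iteration's result. All names and values here are illustrative.
from pyspark import SparkContext

sc = SparkContext("local[2]", "emptyRDD-union-sketch")
result = sc.emptyRDD()
for year in ["2014", "2015", "2016"]:
    part = sc.parallelize(["python %s 42" % year])  # stand-in for a per-tag RDD
    result = result.union(part)
# For many iterations, sc.union([rdd1, rdd2, ...]) builds the same result with
# a flatter lineage than a chain of pairwise unions.
print(result.collect())
sc.stop()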
def run(self, inputFile):
    sc = SparkContext("local[8]", "ratings")
    text = sc.textFile(inputFile)
    header = text.first()  # extract the header
    text = text.filter(lambda row: row != header)
    mapping = text.map(lambda line: line.split(',')) \
        .map(lambda x: (int(x[0]), int(x[1]))).groupByKey()
    movieInput = text.map(lambda line: line.split(',')) \
        .map(lambda x: (int(x[1]), int(x[0]))).groupByKey() \
        .map(lambda x: (x[0], list(x[1]))).sortByKey()
    self.totalUsers = mapping.count()  # count() avoids collecting the whole RDD
    print(self.totalUsers)
    self.generate_hash_functions()

    Signature = movieInput.map(lambda x: (x[0], self.create_signature(x[1])))
    bandSize = self.numHash // self.numBand
    unique = sc.emptyRDD()
    self.movieDict = dict(movieInput.collect())
    start = time.time()
    for i in range(self.numBand):
        a = random.randint(1, 1500)
        b = random.randint(1, 1000)
        # bind i, a and b as default arguments so the closures keep this
        # iteration's values regardless of when Spark serializes them
        bands = Signature.map(
            lambda x, i=i: (x[0], x[1][i * bandSize:(i + 1) * bandSize]))
        bands = bands.map(lambda x, a=a, b=b: (x[0], self.hashBucket(x[1], a, b))) \
            .map(lambda x: (x[1], x[0])) \
            .groupByKey() \
            .map(lambda x: sorted(list(x[1]))) \
            .flatMap(lambda x: list(combinations(x, 2)))
        unique = unique.union(bands).distinct()
    print("flatmap", time.time() - start)

    unique = unique.distinct() \
        .map(lambda x: (x[0], (x[1], self.computeJacardSimilarity(x[0], x[1])))) \
        .filter(lambda x: x[1][1] >= 0.5)
    output = unique.groupByKey().sortByKey() \
        .map(lambda x: (x[0], sorted(list(x[1]), key=lambda tup: tup[0]))).collect()
    print("done", time.time() - start)

    with open('Tuhina_Kumar_SimilarMovie_Jaccard.txt', 'w') as f:
        for i in range(len(output)):
            str1 = ''
            for j in range(len(output[i][1])):
                str1 = str1 + str(output[i][0]) + ', ' + str(output[i][1][j][0]) \
                    + ', ' + str(output[i][1][j][1]) + '\n'
            f.write(str1)
    # precision and recall (omitted)
def bandHash(self):
    sc = SparkContext(appName='Inf553')
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    rdd = sc.textFile(input_file)
    input_data_rdd = rdd.map(self.inputData)
    input_rdd = input_data_rdd.map(self.signature)
    self.inputDict = dict(input_data_rdd.collect())
    counter = 0
    bandVal = self.numOfSig // self.numOfBands  # integer band width
    candidatePairs = sc.emptyRDD()
    for band in range(self.numOfBands):
        # bind the slice bounds as default arguments; counter moves on while
        # the lambda is evaluated lazily
        bandRDD = input_rdd.map(
            lambda x, lo=counter, hi=counter + bandVal: (x[0], x[1][lo:hi]))
        bandCandidatePairs = bandRDD \
            .map(lambda umDict: (tuple(umDict[1]), umDict[0])) \
            .groupByKey() \
            .map(lambda x: list(x[1])) \
            .flatMap(lambda x: list(combinations(x, 2)))
        candidatePairs = candidatePairs.union(bandCandidatePairs)
        counter = counter + bandVal
    pairwiseJaccSim = candidatePairs.distinct() \
        .map(lambda x: (x[0], x[1],
                        self.calculationOfJaccard(self.inputDict[x[0]],
                                                  self.inputDict[x[1]]))) \
        .flatMap(lambda x: ((x[0], [(x[1], x[2])]), (x[1], [(x[0], x[2])]))) \
        .reduceByKey(lambda x, y: x + y) \
        .sortByKey(ascending=True)
    outputRDD = pairwiseJaccSim.map(lambda x: (x[0], dict(x[1]))) \
        .map(lambda x: (x[0], sorted(x[1].items(), key=lambda k: (-k[1], k[0])))) \
        .map(lambda x: (x[0], x[1][:5])) \
        .map(lambda x: ''.join(s for s in [
            'U' + str(x[0]), ':',
            ','.join('U' + str(movieName)
                     for movieName in sorted([a[0] for a in x[1]]))])) \
        .collect()
    outfile = open(output_file, 'w')
    for line in outputRDD:
        outfile.write(line)
        outfile.write('\n')
    outfile.close()
    sc.stop()
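# A tiny illustration (invented data) of the banding step both LSH snippets
# above use: signatures are cut into bands, and items that collide on an
# entire band become candidate pairs.
from itertools import combinations

signatures = {'m1': [3, 7, 2, 9], 'm2': [3, 7, 5, 1], 'm3': [8, 0, 2, 9]}
band_size = 2
for b in range(2):  # two bands of two signature rows each
    buckets = {}
    for item, sig in signatures.items():
        key = tuple(sig[b * band_size:(b + 1) * band_size])
        buckets.setdefault(key, []).append(item)
    for bucket in buckets.values():
        print(list(combinations(sorted(bucket), 2)))  # candidates from this band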
def main(args):
    arguments = parseArguments(args)
    if arguments.debug:
        sys.stdout.write("Arguments: %s\n" % arguments)
    sc = SparkContext(appName=arguments.app)
    items = sc.textFile(arguments.input)
    items = items.map(lambda x: load_item(x)).cache()
    distances = sc.emptyRDD()
    for i in xrange(0, arguments.repetition):
        curr_distances = items \
            .repartitionAndSortWithinPartitions(
                arguments.repartition,
                lambda x: randint(1, arguments.repartition)) \
            .mapPartitions(lambda x: approx_distance_user(x, arguments.num_reco))
        distances = distances.union(curr_distances)
    distances = distances.reduceByKey(lambda a, b: a + b)
    distances = distances.map(lambda x: (x[0], sorted(set(x[1]), key=lambda a: -a[1])))
    distances = distances.collect()
    for item_id, rec in distances:
        print("%s\t%s" % (item_id, rec))
class TFIDF():
    def __init__(self, input_path, output_path):
        self.input = input_path
        self.output = output_path
        self.texts = glob(self.input + '/*.txt')
        self.conf = SparkConf().setAppName('tfidf') \
                               .setMaster('local') \
                               .set('spark.executor.memory', '1g')
        self.sc = SparkContext(conf=self.conf)

    def writeToCSVFile(self, rdd):
        with open(self.output + '/tfidf-scores.csv', 'wb') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['docID', 'word', 'score'])
            writer.writerows(rdd)

    def run(self):
        # Job 1: word frequency per document (Python 2 tuple-unpacking lambdas).
        # dkey and N are bound as default arguments so each closure keeps this
        # iteration's values regardless of when Spark serializes it.
        tfilter = TextFilter().filter
        wcRDD = self.sc.emptyRDD()
        for dkey, textfile in enumerate(self.texts):
            tf = self.sc.textFile(textfile) \
                .filter(lambda line: len(line.strip()) > 0) \
                .flatMap(lambda line: tfilter(line)) \
                .map(lambda word, dkey=dkey: ((word, dkey), 1)) \
                .reduceByKey(operator.add)
            N = tf.map(lambda ((w, d), y): y).sum()
            tf = tf.map(lambda ((w, d), y), N=N: ((w, d), (y, N)))
            wcRDD = self.sc.union([wcRDD, tf])

        # Job 2: word frequency in the corpus, then TF-IDF.
        # float() guards against Python 2 integer division in a/b and D/c.
        D = self.sc.broadcast(len(self.texts))
        wcRDD = wcRDD.map(lambda ((w, d), (a, b)): (w, (d, a, b)))
        wfRDD = wcRDD.map(lambda (w, (d, a, b)): (w, 1)).reduceByKey(operator.add)
        tfidf = wcRDD.join(wfRDD) \
            .map(lambda (w, ((d, a, b), c)):
                 ((d, -float(a) / b * np.log(float(D.value) / c), w), 1)) \
            .sortByKey(True) \
            .map(lambda ((d, z, w), a): (d, w, -z))
        self.writeToCSVFile(tfidf.collect())
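# Hedged, stand-alone check of the score the class above computes:
# tf * idf = (a / N_doc) * ln(D / df). The numbers are made up.
import numpy as np

a, N_doc = 3, 100   # the word occurs 3 times in a 100-word document
D, df = 10, 2       # the corpus has 10 documents; the word appears in 2
score = (float(a) / N_doc) * np.log(float(D) / df)
print(round(score, 4))  # 0.0483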
months = ['january', 'february', 'march', 'april', 'may', 'june',
          'july', 'august', 'september', 'october']

for m in months:
    file_list = []
    for path in all_paths:
        if fnmatch.fnmatch(path, '*/' + m + twitter_files):
            file_list.append(path)

    # Define a schema so that we can create an empty DataFrame
    schema = StructType([StructField('id', StringType(), True),
                         StructField('num_followers', IntegerType(), True),
                         StructField('num_following', IntegerType(), True),
                         StructField('mentions', ArrayType(StringType(), True), True)])
    # Empty DataFrame
    cleaned_df = sqlContext.createDataFrame(sc.emptyRDD(), schema)

    for f in file_list:
        df_temp = sqlContext.read.json(f)
        s_temp = getCols(df_temp)
        cleaned_df = cleaned_df.unionAll(s_temp)

    cleaned_df.write.parquet('file:///home/fang/twitter_user_info' + '_' + m[:3] + '.parquet')
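# Minimal sketch of the empty-DataFrame seed used above: an explicit schema is
# required because an empty RDD has no rows to infer one from. The column name
# is illustrative; sqlContext and sc are the snippet's own contexts.
from pyspark.sql.types import StructType, StructField, StringType

seed_schema = StructType([StructField('id', StringType(), True)])
seed_df = sqlContext.createDataFrame(sc.emptyRDD(), seed_schema)
seed_df = seed_df.unionAll(sqlContext.createDataFrame([('a',)], seed_schema))
print(seed_df.count())  # 1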
    .set_extractor(cve_regex_extractor)

msid_regex = re.compile('(ms[0-9]{2}-[0-9]{3})', re.IGNORECASE)
msid_regex_extractor = RegexExtractor() \
    .set_regex(msid_regex) \
    .set_metadata({'extractor': 'msid-regex'}) \
    .set_include_context(True) \
    .set_renamed_input_fields('text')

msid_regex_extractor_processor = ExtractorProcessor() \
    .set_name('msid_from_extracted_text-regex') \
    .set_input_fields('raw_content') \
    .set_output_field('extractions.msid') \
    .set_extractor(msid_regex_extractor)

cdr_extractions_isi_rdd = sc.emptyRDD()
extraction_source_names = []
for source in source_extraction_fields:
    extraction_source_names.append(source)
    extraction_fields = source_extraction_fields[source]

    cve_process_source = ExtractorProcessor() \
        .set_name('cve_from_extracted_text-regex') \
        .set_input_fields(extraction_fields) \
        .set_output_field('extractions.cve') \
        .set_extractor(cve_regex_extractor)

    msid_process_source = ExtractorProcessor() \
        .set_name('msid_from_extracted_text-regex') \
        .set_input_fields(extraction_fields) \
        .set_output_field('extractions.msid') \
        .set_extractor(msid_regex_extractor)
if len(sys.argv) != 1:
    print("Usage: Report")
    exit(-1)

conf = SparkConf().set('spark.local.dir', '/data/store/tmp')
conf.set('spark.storage.memoryFraction', '0.5')
conf.set('spark.akka.frameSize', '256')

# Connect to Spark
sc = SparkContext(appName="Big Data Report 2015", conf=conf)
stopwords = get_stopwords(sc)

# Part 1
# Load and parse non-empty files into an RDD
rawDataRDD = sc.emptyRDD()
rdds = []
batchSize = 1000
i = 0
dirs = []
for root, dirnames, files in os.walk('/data/store/gutenberg/text-full/'):
    if len(files) == 0:
        continue  # skip empty directories
    if os.stat(os.path.join(root, files[0])).st_size > 1000000:
        continue  # skip files bigger than 1 megabyte
    dirs.append(root)
print("Got {} dirs - {} chunks".format(len(dirs), int(len(dirs) / batchSize)))
countWords = []
sqlContext = SQLContext(sc)
tableName = sc.broadcast(createUniqueTableName('NBM'))
MYSQL_CONNECTION_URL = sc.broadcast(
    'jdbc:mysql://localhost:3306/' + db +
    '?user='******'&password='******  # credentials redacted in the source
    '&useUnicode=true&useJDBCCompliantTimezoneShift=true'
    '&useLegacyDatetimeCode=false&serverTimezone=UTC&useSSL=false')

# CREATE TABLE SCHEMA
schema = StructType([
    StructField("gt", IntegerType(), True),
    StructField("predicted", IntegerType(), True)
])
dfTableSchema = sqlContext.createDataFrame(sc.emptyRDD(), schema)
dfTableSchema.write.jdbc(MYSQL_CONNECTION_URL.value, tableName.value, mode='error')

# LOAD JDBC PROPERTIES
df = sqlContext.read.format('jdbc') \
    .options(url=MYSQL_CONNECTION_URL.value,
             dbtable=db + '.' + tableName.value).load()

# CREATE STREAMING CONTEXT
ssc = StreamingContext(sc, int(spark_batch_duration))
# setting checkpoint
# ssc.checkpoint(".")
    # print(sum(res))
    return sum(res)

increase = Decimal(1)
delete_edge = betweenness.sortBy(lambda x: x[1], False) \
    .map(lambda x: (x[0][0], x[0][1])).first()
delete_edges = [delete_edge]
default_sets = sc.emptyRDD()
Q_max = Decimal(0)
while increase >= 0.0:
    new_graph = graph.filter(lambda x: x not in delete_edges)
    children = new_graph \
        .flatMap(lambda x: [(x[0], [x[1]]), (x[1], [x[0]])]) \
        .reduceByKey(lambda x, y: x + y).collect()
    new_parent_child = dict()
    for node in children:
        new_parent_child[node[0]] = node[1]
    sets = graph.filter(lambda x: x in delete_edges) \
        .flatMap(lambda x: [x[0], x[1]]).distinct() \
        .map(lambda x: findSet(x, new_parent_child)).distinct()
if __name__ == "__main__": if len(sys.argv) != 3: print("Usage: PySparkForumPostTraining.py <classifiers_file> <test_file>", file=sys.stderr) exit(-1) print("Started Classification") sc = SparkContext(appName="PySparkForumPostClassify", pyFiles=['./classifier.py']) classifiers_file_path = sys.argv[1] load_classifiers(classifiers_file_path) test_data_file = sys.argv[2] lines = sc.textFile(test_data_file, 1).zipWithUniqueId() lines.cache() stage0 = sc.emptyRDD() for classifier_key, classifier in classifiers.iteritems(): tmp_rdd = lines.map(lambda x: map_add_classifier(classifier.name,x)) stage0 = stage0.union(tmp_rdd) #print(stage0.first()) # map by value each word in documents. stage1 = stage0.flatMapValues(lambda x: x).filter(filter_inexistent_words) \ .map(map_log_of_probability) \ .reduceByKey(reducer_add) \ .map(map_get_class_prob) \ .reduceByKey(reducer_get_classification) \ .map(map_accuracy_classification) \ .reduceByKey(reducer_add) #print(stage1.take(20))
else:
    cursor = conn.execute("SELECT count(*) FROM casos")
    # fetchone()[0] is the actual row count; len(fetchall()) would just be the
    # number of result rows, which is always 1 for a count(*) query
    log("Table 'casos' found. " + str(cursor.fetchone()[0]) + " existing records.")

sc = SparkContext()
sqlContext = SQLContext(sc)

field = [StructField("DT_NOTIFIC", StringType(), True),
         StructField("NU_ANO", IntegerType(), True),
         StructField("Long_WGS84", StringType(), True),
         StructField("Lat_WGS84", StringType(), True)]
schema = StructType(field)
df = sqlContext.createDataFrame(sc.emptyRDD(), schema)

log("Building the neighborhood (bairros) dictionary...")
bairrosDict = getBairrosDict(sc)
log("Done\n")

cols = ['DT_NOTIFIC', 'NU_ANO', 'Long_WGS84', 'Lat_WGS84']

log("Reading dengue case data...")
for y in range(2010, time.localtime()[0]):
    for m in range(1, 13):
        fileN = "Casos_Notificados_Dengue_mes_ano.csv" \
            .replace("ano", str(y)).replace("mes", "%02d" % m)
        if fileN not in arquivosImportados:
            log("Processing: " + fileN)
            path = "./" + fileN
    return res

if __name__ == "__main__":
    os.system("hadoop fs -mkdir -p features_selection")
    os.system("hadoop fs -mkdir -p features_selection/input_mat")
    iters = 30
    if len(sys.argv[1].split('/')) > 1:
        input_mat = sys.argv[1].split('/')[1].split('.')[0]
    else:
        input_mat = sys.argv[1].split('.')[0]
    df = sqlcontext.read.parquet(
        'hdfs://master:9000/user/hduser/matrix_of_depend/' + input_mat)
    targets = df.select("P1").distinct().rdd.map(lambda x: x[0]).collect()
    features = sc.emptyRDD()
    for target in targets:
        hubs = select_k_variables(df, target, iters=iters, k=3)
        features = features.union(hubs.map(lambda x: (x[0], [x[1]])))
    features = features.reduceByKey(lambda x, y: x + y) \
        .map(lambda x: (x[0].split('_')[0], x[0].split('_')[1], x[1])) \
        .map(lambda x: (x[0], x[1], list_to_str(x[2]))) \
        .toDF(["Col", "NbV", "Hubs"])
    features.write.format("com.databricks.spark.csv") \
        .mode("overwrite") \
        .option("header", "true") \
        .save("/user/hduser/features_selection/" + input_mat)
def aggregate_tags_count(new_values, total_sum):
    return sum(new_values) + (total_sum or 0)

# Helper that returns a singleton SQLContext instance
def get_sql_context_instance(spark_context):
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']

from pyspark.sql.types import *

schema = StructType([])
sql_context = HiveContext(sc)
empty = sql_context.createDataFrame(sc.emptyRDD(), schema)

# Helper that reports the counts for each RDD
def process_rdd(_, rdd):
    try:
        # Get the SQLContext singleton
        sql_context = get_sql_context_instance(rdd.context)
        # Convert the RDD to a Row RDD
        row_rdd = rdd.map(lambda w: Row(hashtag=w[0], hashtag_count=w[1]))
        # Create a DataFrame from the Row RDD
        hashtags_df = sql_context.createDataFrame(row_rdd)
        # Register the DataFrame as a table
        hashtags_df.registerTempTable("hashtags")
        # Get the 20 most frequent hashtags
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.ml.feature import RegexTokenizer, StringIndexer
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, LongType, BooleanType, ArrayType
from pyspark.ml.clustering import KMeans

sc = SparkContext()
spark = SparkSession.builder.appName("task2").config(
    "spark.some.config.option", "some-value").getOrCreate()
sqlContext = SQLContext(spark)

resub = F.udf(lambda string: re.sub(r'[^\w\s]', '', string), StringType())

# collect the file list, then build one RDD across all files
cluster = sc.textFile("cluster2.txt").collect()
inp = sc.emptyRDD()
for file in cluster:
    filePath = "/user/hm74/NYCColumns/" + file.replace("'", "")
    tmp = sc.textFile(filePath).map(lambda row: row.split("\t")) \
        .map(lambda x: (str(x[0]), x[1]))
    inp = sc.union([inp, tmp])
inp = inp.reduceByKey(lambda x, y: int(x) + int(y))
df = sqlContext.createDataFrame(inp, ['inp', 'count'])
df = df.withColumn("sentence", resub(df.inp))

# tokenize words
regexTokenized = RegexTokenizer(inputCol="sentence", outputCol="words").transform(df)
regexTokenized = regexTokenized.select("sentence", "words", "count")
sql_cart = "select uid,pid,0 as rating from data_cart" sql_favorites = "select uid,pid,0 as rating from data_favorites" cur = conn.cursor() cur.execute(sql_payment) rdd_payment = sc.parallelize(cur.fetchall()) print "rdd_payment.count() = %s" % rdd_payment.count() cur.execute(sql_order) rdd_order = sc.parallelize(cur.fetchall()) print "rdd_order.count() = %s" % rdd_order.count() data_cart = cur.execute(sql_cart) rdd_cart = sc.parallelize(cur.fetchall()) print "rdd_cart.count() = %s" % rdd_cart.count() data_favorites = cur.execute(sql_favorites) rdd_favorites = sc.parallelize(cur.fetchall()) print "rdd_favorites.count() = %s" % rdd_favorites.count() rdd_rating = sc.emptyRDD() rdd_temp = rdd_payment.map(lambda x:(x[0],x[1],10.0)) rdd_rating = rdd_rating.union(rdd_temp) print "rdd_temp_payment.count() = %s" % rdd_temp.count() rdd_temp = rdd_order.subtract(rdd_payment).map(lambda x:(x[0],x[1],8.0)) rdd_rating = rdd_rating.union(rdd_temp) print "rdd_temp_order.count() = %s" % rdd_temp.count() rdd_temp = rdd_cart.subtract(rdd_order).subtract(rdd_payment).map(lambda x:(x[0],x[1],7.0)) rdd_rating = rdd_rating.union(rdd_temp) print "rdd_temp_cart.count() = %s" % rdd_temp.count() rdd_temp = rdd_favorites.subtract(rdd_cart).subtract(rdd_order).subtract(rdd_payment).map(lambda x:(x[0],x[1],5.0)) rdd_rating = rdd_rating.union(rdd_temp) print "rdd_temp_favorites.count() = %s" % rdd_temp.count() print "rdd_rating.count() = %s" % rdd_rating.count() collect = rdd_rating.collect()
    try:
        correlation = (n * s_xy - s_x * s_y) / (sqrt(
            (n * s_x2 - s_x ** 2) * (n * s_y2 - s_y ** 2)))
    except (ZeroDivisionError, ValueError):
        # a constant series makes the denominator zero; keep the default value
        pass
    return correlation

input = sc \
    .textFile(args.input) \
    .map(lambda line: line.split(',')) \
    .filter(lambda splits: len(splits) == 8 and splits[0][0] != '#') \
    .map(lambda x: (x[0], x[2], int(x[3]), float(x[4]))) \
    .filter(filter_moment) \
    .map(lambda x: (x[0],
                    int(datetime.strptime(x[1], '%Y%m%d%H%M%S%f').timestamp()),
                    x[2], x[3])) \
    .cache()

result = sc.emptyRDD()
for width in args.candle_widths:
    candles = input \
        .map(lambda x: moment_to_candle_start(x, width)) \
        .reduceByKey(lambda x, y: x if x[0] > y[0] else y) \
        .map(lambda x: (x[0][0], (x[0][1], x[1][1]))) \
        .groupByKey() \
        .cache()
    for shift in args.candle_shifts:
        shifted_candles = candles \
            .map(lambda x: (x[0] + shift * width, x[1])).cache()
        correlations = candles \
            .join(shifted_candles) \
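# Stand-alone sanity check of the correlation formula above (made-up values):
# r = (n*sum(xy) - sum(x)*sum(y))
#     / sqrt((n*sum(x^2) - sum(x)^2) * (n*sum(y^2) - sum(y)^2))
from math import sqrt

xs, ys = [1.0, 2.0, 3.0], [2.0, 4.0, 6.0]
n = len(xs)
s_x, s_y = sum(xs), sum(ys)
s_xy = sum(x * y for x, y in zip(xs, ys))
s_x2, s_y2 = sum(x * x for x in xs), sum(y * y for y in ys)
r = (n * s_xy - s_x * s_y) / sqrt((n * s_x2 - s_x ** 2) * (n * s_y2 - s_y ** 2))
print(r)  # 1.0 for perfectly linear data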
parser.add_argument("-s","--sigma",type=float,default=5,help="sigma to use for gaussing smoothing") parser.add_argument("-d","--dst",default="/tmp",help="destination path") parser.add_argument("-n","--num_partitions",type=int,default=16,help="number of partitions to create, each with num_files/num_partitions records") parser.add_argument('--nocache', dest='nocache',default=False,action='store_true',help="cache image stack before thresholding") parser.add_argument('--granular', dest='granular',default=False,action='store_true',help="granular image processing operations (vs grouped)") args = parser.parse_args() threshold_percent=args.percent gaussian_sigma=args.sigma sc = SparkContext(appName="APS_Thresholder") import time t0=tbegin=time.time() files=sc.emptyRDD() if args.path != None: filelist = genfilelist(args.path, args.ext) #note: occasional problem here when worker tries to read unsynced nfs file, use filelist instead files = sc.textFile(filelist) else: files=sc.textFile(args.filelist) # threshold_stack.foreach(noop) #useful to force pipeline to execute for debugging # tmark=time.time() # print("generate and read files: %0.6f"%(tmark-to)) # t0=tmark slice_count=files.count() files=files.repartition(args.num_partitions) # Or maybe just slice_count, but I suspect file size plays a significant role in how many records per partition is optimal. stack=files.map(readTiff)
print(rdd.getNumPartitions())

# create the signature matrix
x = rdd.mapPartitions(lambda iterator: create_local_signature_matrix(
        iterator, user_map)) \
    .reduceByKey(lambda x, y: getMinForBusinessFronPartitions(x, y))
signature_matrix = dict(x.collect())

# create the bands
y = x.flatMap(lambda x: divideIntoBands(x)).groupByKey().collect()

# get candidate pairs
candidates = sc.emptyRDD()
for i in y:
    band = sc.parallelize(i[1])
    c = band.groupByKey().map(lambda x: list(x[1])) \
        .flatMap(lambda x: list(combinations(x, 2)))
    candidates = candidates.union(c)
candidates = candidates.distinct().persist()
singleCandidates = candidates.flatMap(lambda x: [x[0], x[1]]).distinct().collect()
candidates = candidates.collect()

businesses_copy = rdd.map(lambda x: (x[1], [x[0]])) \
    .reduceByKey(lambda x, y: x + y)
def run(cfg):
    global hive_context
    sc = SparkContext()
    hive_context = HiveContext(sc)
    sc.setLogLevel('WARN')

    # ESClient requires a host ip
    es_client_booking = ESClient(cfg['es_host'], cfg['es_port'],
                                 cfg['es_booking_index'], cfg['es_booking_type'])
    bookings = es_client_booking.search({})  # get at most 1000 results for now
    bookings = optimizer.util.filter_valid_bookings(bookings)

    # adjust dates in bookings
    bookings = optimizer.util.adjust_booking_dates(bookings)
    bookings_map = optimizer.util.get_bookings_map(bookings)

    df = hive_context.createDataFrame(sc.emptyRDD(),
                                      optimizer.util.get_common_pyspark_schema())
    today = cfg['today']  # YYYY-MM-DD
    days = optimizer.util.get_days_from_bookings(today, bookings)

    df = generate_resources(cfg, df, bookings_map, days, bookings, hive_context)
    # Row(day='2018-04-02', ands=['b1', 'b3', 'b2'], minus=[], allocated={}, amount=43562)
    print('defining resources')
    df.cache()
    print(df.take(1))

    # run the allocation
    df = hwm_allocation(df, bookings, days)
    # Row(day='2018-04-02', ands=['b1', 'b3', 'b2'], minus=[], amount=43562, allocated={'b2': 800, 'b3': 1000, 'b1': 500})
    print('bb-bookings allocation')
    df.cache()
    print(df.take(1))

    # lock bookings
    lock_booking(es_client_booking, True)
    # remove bbs
    remove_booking_buckets(cfg, days)

    # save new booking-buckets into es
    df = save_booking_buckets_in_es(cfg, df)
    print('bbs saved')
    df.cache()
    print(df.take(1))

    # unlock bookings
    lock_booking(es_client_booking, False)

    day = days[-1]
    tomorrow = optimizer.util.get_next_date(day)
    # use only tomorrow to create the allocation plan
    df = df.filter(df.day == tomorrow)

    # this method adds the bbs' ucdocs allocation_map with their values
    df = add_ucdoc_bb_allocation_map(cfg, df, bookings_map)
    # [Row(day='2018-04-02', ands=['b1', 'b3', 'b2'], minus=[], amount=43562, allocated={'b2': 800, 'b3': 1000, 'b1': 500}, allocation_map={'minusonepage,3,5G,g_x,2,pt,1002,icc': {'b2': 1, 'b3': 2, 'b1': 1}, 'magazinelock,2,3G,g_x,3,pt,1005,icc': {'b2': 56, 'b3': 70, 'b1': 35}, 'magazinelock,2,4G,g_x,3,pt,1005,icc': {'b2': 56, 'b3': 70, 'b1': 35}, 'minusonepage,3,5G,g_x,2,pt,1003,icc': {'b2': 6, 'b3': 8, 'b1': 4}, 'minusonepage,1,4G,g_x,2,pt,1003,icc': {'b2': 16, 'b3': 20, 'b1': 10}, 'minusonepage,2,4G,g_f,4,pt,1002,icc': {'b2': 12, 'b3': 15, 'b1': 8}, 'cloudFolder,2,5G,g_x,3,pt,1005,icc': {'b2': 57, 'b3': 72, 'b1': 36}, 'minusonepage,2,3G,g_x,3,pt,1002,icc': {'b2': 3, 'b3': 4, 'b1': 2}, 'minusonepage,1,3G,g_x,1,pt,1005,icc': {'b2': 27, 'b3': 33, 'b1': 17}, 'minusonepage,1,3G,g_x,4,pt,1004,icc': {'b2': 72, 'b3': 90, 'b1': 45}, 'magazinelock,2,5G,g_x,4,pt,1004,icc': {'b2': 32, 'b3': 40, 'b1': 20}, 'cloudFolder,2,3G,g_f,3,pt,1002,icc': {'b2': 16, 'b3': 20, 'b1': 10}, 'cloudFolder,3,5G,g_f,2,pt,1004,icc': {'b2': 27, 'b3': 34, 'b1': 17}})]
    print('ucdocs-bookings allocation')
    df.cache()
    print(df.take(1))

    # at this point we have a df which is an allocation of bookings to bbs
    df = df.select(df.day, explode(df.allocation_map))
    # Row(day='2018-04-02', key='magazinelock,3,5G,g_x,2,pt,1004,icc', value={'b2': 14, 'b3': 18, 'b1': 9})
    print('exploded')
    df.cache()
    print(df.take(1))

    # agg all the allocation maps for a ucdoc
    _map_type = MapType(StringType(), IntegerType())
    _audf = udf(agg_allocation_maps, _map_type)
    df = df.groupBy('key').agg(_audf(collect_list('value')).alias('allmap'))
    # [Row(key='cloudFolder,3,5G,g_f,2,pt,1004,icc', allmap={'b2': 27, 'b3': 34, 'b1': 17})]
    print('final aggregation')
    df.cache()
    print(df.take(1))

    # writing into hdfs
    filename = 'allmap-{}-{}'.format(
        optimizer.util.convert_date_remove_dash(day), str(int(time.time())))
    df.write.save(filename, format='json')
    return when(col(x) != "", col(x)).otherwise(0)

# Importing and processing our dataset
records_rdd = raw_records.map(pre_process)
records_df = records_rdd.toDF(schema=["Timestamp", "LineID", "Direction", "JourneyPatternID", "Timeframe",
                                      "VehicleJourneyID", "Operator", "Congestion", "Lon", "Lat", "Delay",
                                      "BlockID", "VehicleID", "StopID", "AtStop"])
records_df_without_empty = records_df.withColumn("LineID", blank_as_null("LineID"))

# Creating an empty DataFrame for storing the coordinates of all stops within all LineIDs
relevant_fields = [StructField("LineID", StringType(), True),
                   StructField("StopID", StringType(), True),
                   StructField("Lon", StringType(), True),
                   StructField("Lat", StringType(), True)]
schema = StructType(relevant_fields)
all_coordinates_df = sqc.createDataFrame(sc.emptyRDD(), schema)

# Remapping records into an RDD keyed by LineID
filtered_data_rdd = records_df_without_empty.rdd.map(
    lambda x: (str(x["LineID"]), (str(x["LineID"]),
                                  time.ctime(int(str(x["Timestamp"])) / 1000000),
                                  str(x["JourneyPatternID"]),
                                  int(str(x["VehicleID"])),
                                  int(str(x["VehicleJourneyID"])),
                                  int(str(x["Delay"])),
                                  str(x["Lon"]), str(x["Lat"]),
                                  str(x["StopID"]),
                                  int(str(x["AtStop"])))))

# Grouping those records on LineID
grouped_by_lineID = filtered_data_rdd.groupByKey().mapValues(list)
results_1 = grouped_by_lineID.collect()
header = patient.first()
patient = patient.filter(lambda row: row != header)
patient = patient.flatMap(pair_patient_to_disease).filter(patient_filter)

support_pct = args.min_sup
patient_cnt = patient.keys().distinct().count()
min_support = round(patient_cnt * support_pct)

joined_set = geo.join(patient).map(lambda row: (row[0], row[1][0][0], row[1][0][1]))
transaction_set = joined_set.map(lambda s: (s[0], s[1])).groupByKey() \
    .map(lambda s: (s[0], set(s[1])))
items = joined_set.map(lambda row: (row[1], row[2])).keys().distinct().collect()

final_set = sc.emptyRDD()
for i in range(args.num_iter):
    itemset_combs = generate_item_combs(items, i + 1)
    retitems = transaction_set.values() \
        .flatMap(lambda row: returnItems(itemset_combs, row)) \
        .reduceByKey(sum_patient_count) \
        .map(lambda row: (row[0], row[1], min_support)) \
        .filter(min_support_filter) \
        .map(lambda row: (row[1], row[0]))
    final_set = final_set.union(retitems)

final_set = final_set.sortBy(lambda x: x[0], ascending=False)
final_set.map(lambda x: str(x[0]) + '\t' + "\t".join(y for y in x[1])) \
    .coalesce(1).saveAsTextFile(args.output)
# NOTE: This code requires you to have downloaded simulation snapshots
# to an EBS storage attached to your cluster.
# If you are interested in getting access to the Caterpillar particle data,
# please contact the Caterpillar team / email [email protected]

conf = SparkConf().setAppName('project_spark')
sc = SparkContext(conf=conf)

# getting the snapshot number from the command line
snap = sys.argv[1]

# downsampling factor
down_max = 0.1

allpos_rdd = sc.emptyRDD()
snap3char = str(snap).zfill(3)
for i in range(64):
    # read the file into a numpy array
    newfile = h5py.File('/mnt/s3/snapdir_' + snap3char + '/snap_' + snap3char
                        + '.' + str(i) + '.hdf5', 'r')
    particletypes = newfile.keys()[1:]
    # loop through all particle types
    for newtype in particletypes:
        if newtype == 'PartType1':
            # load the coordinates of the high-resolution type into an RDD
            positions = newfile[newtype]['Coordinates'][:]
            positions_rdd = sc.parallelize(positions)
            # downsample the rdd
            typeindex = int(newtype[-1]) - 1
def date_and_day(df):
    first_timestamp = int(float(df.collect()[0]["Timestamp"])) / 1000000
    readable_first_timestamp = t.ctime(first_timestamp)
    day = readable_first_timestamp[0:3]
    date = readable_first_timestamp[4:10]
    return day, date

# Creating an empty DataFrame for storing busy-lines data
relevant_fields = [StructField("LineID", IntegerType(), True),
                   StructField("Number of times at stops", IntegerType(), True),
                   StructField("Date", StringType(), True),
                   StructField("Day", StringType(), True)]
schema = StructType(relevant_fields)
busy_lines_df = sqc.createDataFrame(sc.emptyRDD(), schema)

# Importing and cleaning our dataset
records_rdd = raw_records.map(pre_process)
records_df = records_rdd.toDF(schema=["Timestamp", "LineID", "Direction", "JourneyPatternID", "Timeframe",
                                      "VehicleJourneyID", "Operator", "Congestion", "Lon", "Lat", "Delay",
                                      "BlockID", "VehicleID", "StopID", "AtStop"])
records_df = cleaning(records_df)

# Getting the day and date for this set of records
day, date = date_and_day(records_df)

# Remapping the RDD as a PairRDD with LineID as key
records_keyLineID_rdd = records_df.rdd.map(lambda x: (int(str(x["LineID"])),
                                                      [(int(str(x["LineID"])),
                                                        int(str(x["StopID"])),
                                                        int(float(str(x["Timestamp"]))),
# If this is the main program
if __name__ == "__main__":

    # Make sure we have all the arguments we need
    if len(sys.argv) != 1:
        print("Usage: Report")
        exit(-1)

    conf = SparkConf().set('spark.local.dir', '/data/store/tmp')

    # Connect to Spark
    sc = SparkContext(appName="Big Data Report 2015", conf=conf)
    stopwords = get_stopwords(sc)

    # Part 1
    # Load and parse non-empty files into an RDD
    rawDataRDD = sc.emptyRDD()
    rdds = []
    batchSize = 1000
    i = 0
    dirs = []
    for root, dirnames, files in os.walk('/data/store/gutenberg/text-full/'):
        if len(files) == 0:
            continue  # skip empty directories
        if os.stat(os.path.join(root, files[0])).st_size > 1000000:
            continue  # skip files bigger than 1 megabyte
        dirs.append(root)
    print("Got {} dirs - {} chunks".format(len(dirs), int(len(dirs) / batchSize)))
# treatment_day = datetime.strptime(sys.argv[1], '%Y-%m-%d').date()
# source_root = '/home/vlepot/dev/navitia-stat-logger/tmp'
# source_root = 'gs://hdp_test'
source_root = sys.argv[1]

treatment_day_start = datetime.strptime(sys.argv[2], '%Y-%m-%d').date()
treatment_day_end = datetime.strptime(sys.argv[3], '%Y-%m-%d').date()

print "Go for dates: " + treatment_day_start.strftime('%Y-%m-%d') + " -> " + treatment_day_end.strftime('%Y-%m-%d')
print "Source root dir: " + source_root

conf = SparkConf().setAppName("coverage_journeys_compiler")
sc = SparkContext(conf=conf)

statsLines = sc.emptyRDD()
treatment_day = treatment_day_start
while treatment_day <= treatment_day_end:
    if source_root.startswith("/") and \
            len(glob(source_root + '/' + treatment_day.strftime('%Y/%m/%d') + '/*.json.log*')) > 0:
        statsLines = statsLines.union(sc.textFile(
            source_root + '/' + treatment_day.strftime('%Y/%m/%d') + '/*.json.log*'))
    treatment_day += timedelta(days=1)

dayStats = statsLines.map(
    lambda stat: json.loads(stat)
).filter(
    lambda line: line["api"] == 'v1.journeys'
)
import sys
import argparse
import json
from datetime import datetime

from pyspark import SparkContext

'''
parser = argparse.ArgumentParser(description='Process an availability report')
parser.add_argument('--in', dest='input')
args = parser.parse_args()
'''

if __name__ == "__main__":
    sc = SparkContext(appName="AvailabilityReport")

    # input paths are taken straight from the command line
    rdd = sc.emptyRDD()
    for path in sys.argv[1:]:
        rdd = rdd.union(sc.wholeTextFiles(path))

    availabilityTuples = rdd.values() \
        .flatMap(lambda fz: fz.split("\n")) \
        .filter(lambda u: len(u) > 0) \
        .map(lambda line: line.split("\t")[2]) \
        .distinct() \
        .keyBy(lambda u: int(json.loads(u)['sequence'] / 1E4)) \
        .mapValues(lambda u:
                   (1,
                    datetime.strptime(json.loads(u)['time'], '%Y-%m-%dT%H:%M:%S.%fZ'),
                    datetime.strptime(json.loads(u)['time'], '%Y-%m-%dT%H:%M:%S.%fZ'))) \
        .reduceByKey(lambda a, b:
                     (a[0] + b[0], min(a[1], b[1]), max(a[2], b[2]))
class StravaLoader(object):

    def __init__(self,
                 data_source='local',
                 activity_directory='strava-activities-subset',
                 s3bucket='larsbk',
                 athletes=None,
                 activity_types=['Ride', 'Run', 'NordicSki'],
                 sc=None,
                 hiveContext=None,
                 conf=(SparkConf().setAppName('Strava analysis')),
                 filter_bug_inducing_rows=True):
        ''' Initialize Strava Analysis object '''

        # INPUT PARAMETERS
        self.athletes = athletes  # Athletes to analyze (optional)
        self.activity_types = activity_types  # Activity types to consider (default)
        self.filter_bug_inducing_rows = filter_bug_inducing_rows

        # CONFIGURE SPARK
        if sc is not None and hiveContext is not None:
            # Both contexts were supplied by the user
            print 'Info: Using supplied SparkContext and HiveContext'
            self.sc = sc
            self.hiveContext = hiveContext
        else:
            # Initialize new contexts
            print 'Info: Initializing SparkContext and HiveContext from (default) conf'
            self.sc = SparkContext(conf=conf)
            self.hiveContext = HiveContext(self.sc)

        self.schema = pickle.load(open('./schema.p', 'rb'))  # The pre-defined schema
        self.df = None  # Empty DataFrame to be populated later

        # CONFIGURE DATA SOURCE
        data_root_path = {
            's3': 's3n://%s/%s/' % (s3bucket, activity_directory),
            'local': './%s/' % activity_directory
        }

        if data_source not in data_root_path.keys():
            # Check if the data source is valid; the format string needs both
            # the source name and the list of supported sources
            raise Exception(('Unrecognized data source %s. '
                             'Supported sources: "%s".')
                            % (data_source, '", "'.join(data_root_path.keys())))

        self.data_source = data_source  # This is a valid data source
        self.path = data_root_path[data_source]  # This is the path to the data

        # (S3 SPECIFIC STUFF)
        if data_source == 's3':
            # Get a list of files in the activity directory
            bucket = boto3.resource('s3').Bucket(s3bucket)
            objects = bucket.objects.filter(Prefix='%s/gpx/' % activity_directory)
            files = [obj.key for obj in objects]

            # Make a set of observed combinations of athlete and activity_type
            athlete_and_type = set([])  # Empty set to populate
            fpattern = '\/([\w]+)\/(?:[\w-]+)-([\w]+)\.gpx'  # File name pattern
            for fname in files:
                match = re.match(activity_directory + '/gpx' + fpattern, fname)
                if match:
                    athlete_and_type.add((match.group(1), match.group(2)))

            self.s3_athlete_and_type = athlete_and_type  # Save the set for later use

        pass

    def _get_athlete_directories(self):
        ''' Look for athlete directories in data_root_path
        and update self.athletes '''

        if self.data_source in ['local']:
            self.athletes = [
                directory for directory in os.listdir(self.path + 'gpx/')
                if re.match('^[\w-]+$', directory)
            ]
        else:
            print ('Warning: Automatic directory/athlete detection not yet supported '
                   'for data source %s. Using: "akrogvig", "lkrogvig", "brustad"') \
                % self.data_source
            self.athletes = ['akrogvig', 'lkrogvig', 'brustad']
        pass

    def _activities_exist(self, athlete, activity_type):
        ''' Check if there exist activities of type <activity_type> for
        athlete <athlete>; returns a boolean value '''

        # Check the local directory with glob
        if self.data_source == 'local':
            return glob.glob(self.path + 'gpx/%s/*%s.gpx' % (athlete, activity_type))

        # Check if the combination exists by using previously compiled sets
        elif self.data_source == 's3':
            return ((athlete, activity_type) in self.s3_athlete_and_type)

    def _load_dataset(self):
        ''' Load strava activities from the source into DataFrame self.df '''

        # Get the athlete list if not already set
        if not self.athletes:
            self._get_athlete_directories()

        # Initialize an empty dataset
        self.df = self.hiveContext.createDataFrame(self.sc.emptyRDD(), self.schema)

        for athlete in self.athletes:
            for activity_type in self.activity_types:

                # Check that there are files of that type (or else .load fails)
                if self._activities_exist(athlete, activity_type):

                    # Read the data
                    dfadd = self.hiveContext.read.format('com.databricks.spark.xml') \
                        .options(rowTag='trkpt', treatEmptyValuesAsNulls=False) \
                        .schema(self.schema) \
                        .load(self.path + 'gpx/%s/*%s.gpx' % (athlete, activity_type))

                    dfadd = dfadd.withColumn('athlete', lit(athlete)) \
                                 .withColumn('activity_type', lit(activity_type))

                    self.df = self.df.unionAll(dfadd)

        if self.filter_bug_inducing_rows:
            self.df = self.df.filter(
                self.df['extensions.gpxtpx:TrackPointExtension.#VALUE'].isNull())
        pass

    def derive_schema(self):
        ''' Load all data in self.path, derive the schema, and save it
        with pickle to "schema.p" '''

        df = self.hiveContext.read.format('com.databricks.spark.xml') \
            .options(rowTag='trkpt') \
            .load(self.path + 'gpx/*')
        df = df.withColumn('athlete', lit(None).cast(StringType())) \
               .withColumn('activity_type', lit(None).cast(StringType()))

        df.printSchema()
        pickle.dump(df.schema, open("schema.p", "wb"))
        pass

    def get_dataset(self):
        ''' Return the strava activity dataset '''

        if not self.df:
            self._load_dataset()
        return self.df
solEstaciones = dfEstaciones.rdd
solEpocas = dfEpocas.rdd

# solMeses holds ((MONTH, GENRE), COUNT)
solMeses = solMeses.map(lambda x: ((x[1], x[0]), x[2]))
solMeses = solMeses.reduceByKey(lambda a, b: a + b)

# solEstaciones holds ((STATION, GENRE), COUNT)
solEstaciones = solEstaciones.map(lambda x: ((x[1], x[0]), x[2]))
solEstaciones = solEstaciones.reduceByKey(lambda a, b: a + b)

# solEpocas holds ((SEASON, GENRE), COUNT)
solEpocas = solEpocas.map(lambda x: ((x[1], x[0]), x[2]))
solEpocas = solEpocas.reduceByKey(lambda a, b: a + b)

maximo = sc.emptyRDD()  # maximo will hold the top 5 genres per month, season and station
minimo = sc.emptyRDD()  # minimo will hold the worst genre per month, season and station

for mes in meses:
    rdd4 = solMeses.filter(lambda x: mes == x[0][0])  # keep the rows for the month we want
    rdd4 = rdd4.map(lambda x: (x[1], x[0][1]))
    rdd4 = rdd4.sortByKey(False)  # sort by view count, descending
    rdd5 = rdd4.sortByKey(True)   # sort by view count, ascending
    if rdd4.count() > 0:
        rdd4 = rdd4.take(5)  # take the top 5
        rdd5 = rdd5.take(1)  # and the worst one
        rdd4 = sc.parallelize(rdd4)  # back to RDDs
        rdd5 = sc.parallelize(rdd5)
        rdd4 = rdd4.map(lambda x: (mes, x[1], x[0]))
        block = 3
    else:
        block = 4
    return (grid, block)

grid_block_udf = udf(grid_block, grid_schema)

schema = StructType([StructField("memsn", LongType(), True),
                     StructField("utc", StringType(), True),
                     StructField("meter", LongType(), True),
                     StructField("busy", LongType(), True),
                     StructField("acc", BooleanType(), True),
                     StructField("grid", IntegerType(), True),
                     StructField("block", IntegerType(), True)])
last_df = spark.createDataFrame(sc.emptyRDD(), schema)

def process(time, rowRdd):
    print("========= %s =========" % str(time))
    if rowRdd.isEmpty():
        print("Rdd is empty")
        return
    tw = pendulum.timezone("Asia/Taipei")
    time = tw.convert(time)
    utc = pendulum.timezone("UTC")
    end = pendulum.instance(utc.convert(time)).subtract(minutes=2)
    start = end.subtract(minutes=3)
    end.set_to_string_format("%Y-%m-%dT%H:%M:%SZ")
    start.set_to_string_format("%Y-%m-%dT%H:%M:%SZ")
    taxi_df = spark.createDataFrame(rowRdd)
    for a in ans:
        print(a)

# question 7
if int(question) == 7:
    print("users who started a session on all hosts")
    print(" + : ", end="")
    # start from the first host's users and intersect with the rest
    fu = uniqueuser(readhost(hosts[0]))
    for i in hosts[1:]:
        fu = fu.intersection(uniqueuser(readhost(i)))
    print(fu.collect())

# question 8
if int(question) == 8:
    print("users who started a session on exactly one host, with host name")
    print(" + : ", end="")
    fu = sc.emptyRDD()
    for i in hosts:
        # tag each user with the host name; i is bound as a default argument
        # so the closure keeps this iteration's host
        fu = fu.union(uniqueuser(readhost(i)).map(lambda x, i=i: (x, i)))
    # users seen on several hosts end up with host1+host2+... as value and
    # thus will not match a single host name
    print(fu.reduceByKey(lambda x, y: x + y).filter(
        lambda x: x[1] in hosts).sortBy(lambda x: x[1]).collect())

# question 9
if int(question) == 9:
    print("Host Anonymization")
    for i in hosts:
        print(" + " + i + ": ")
        print(" . User name mapping: ", end="")
        us = readhost(i)
        name, value = mapping[idx]
        # if we already have a value for this feature, skip to the next one
        if name in thisfeats and thisfeats[name] != 'None':
            continue
        if f == '1':
            thisfeats[name] = value
        else:
            thisfeats[name] = 'None'
    for ff in featids[1:]:
        vtxfeats.append(thisfeats[ff])
    return vtx(vtxid, *vtxfeats)

# load all of the feature maps, feature files, and self features into an RDD
alledges = sc.emptyRDD()
for personid in peopleids:
    featmap_fname = "/home/zeppelin/facebook/%d.featnames" % personid
    feats_fname = "%d.feat" % personid
    this_feats_fname = "%d.egofeat" % personid

    # load the feature map
    fmap = []
    with open(featmap_fname) as flines:
        for line in flines:
            fmap.append(fn_process(line))

    # load the features for all the edges, and our own; personid is bound as a
    # default argument so the closure keeps this iteration's id
    f_rdd = sc.textFile(feats_fname).map(lambda x: feat_process(x, -1)) \
        .union(sc.textFile(this_feats_fname)
               .map(lambda x, personid=personid: feat_process(x, personid)))
hdfs_client = InsecureClient(hdfs_address, user=hdfs_user)

# opening training and test data files
if not cluster_execution:
    learning_data_filename_training = 'file://' + learning_data_filename_training
    id_to_dataset_filename_training = 'file://' + id_to_dataset_filename_training
    if learning_data_filename_test:
        learning_data_filename_test = 'file://' + learning_data_filename_test
        id_to_dataset_filename_test = 'file://' + id_to_dataset_filename_test

learning_data_training = sc.textFile(learning_data_filename_training + '/*') \
    .persist(StorageLevel.MEMORY_AND_DISK)
id_to_dataset_training = sc.pickleFile(id_to_dataset_filename_training) \
    .persist(StorageLevel.MEMORY_AND_DISK)

learning_data_test = sc.emptyRDD()
id_to_dataset_test = sc.emptyRDD()
if learning_data_filename_test:
    learning_data_test = sc.textFile(learning_data_filename_test + '/*') \
        .persist(StorageLevel.MEMORY_AND_DISK)
    id_to_dataset_test = sc.pickleFile(id_to_dataset_filename_test) \
        .persist(StorageLevel.MEMORY_AND_DISK)

# taking the first element and checking if information about the joined dataset is present
has_joined_data = False
first = json.loads(learning_data_training.first())
if 'joined_dataset' in first:
    has_joined_data = True

# generating learning instances for training
class Reader():

    def __init__(self):
        self.sc = SparkContext('local', 'Stream-SQL')
        self.ssc = StreamingContext(self.sc, batchDuration=3)
        self.spark = SparkSession.builder.getOrCreate()
        self.sc.setLogLevel('ERROR')

    def initStream(self):
        self.readInput()
        self.ssc.start()
        self.ssc.awaitTermination()

    def inputSQLQuery(self, query):
        self.modQuery = ''
        self.dictInnerQuery = {}
        innerFlag = False
        innerCol = ''
        wordList = query.split(' ')
        wordQuery = ''
        for i in range(len(wordList)):
            word = wordList[i]
            # Detect the opening '(' of an inner query
            if word == '(SELECT':
                innerFlag = True
                innerCol = wordList[i - 2]
            if innerFlag:
                wordQuery += word + ' '
            else:
                self.modQuery += word + ' '
            # Detect the closing ')' of "table)" and not "AVG(col)"
            if ')' in word and '(' not in word:
                replaceInner = 'Q' + str(len(self.dictInnerQuery))
                self.modQuery += replaceInner + ' '
                key = replaceInner
                value = [wordQuery, innerCol, 0]
                self.dictInnerQuery[key] = value
                innerFlag = False
                wordQuery = ''

    def readInput(self):
        lines = self.ssc.textFileStream('Data/Live')
        self.csvSchema = StructType([
            StructField('col1', IntegerType()),
            StructField('col2', IntegerType()),
            StructField('col3', IntegerType())
        ])
        self.stateDF = self.spark.createDataFrame(self.sc.emptyRDD(), self.csvSchema)
        self.globalDF = self.spark.createDataFrame(self.sc.emptyRDD(), self.csvSchema)
        self.totalTime = 0.0

        def row(inpStr):
            return Row(int(inpStr[0]), int(inpStr[1]), int(inpStr[2]))

        def iterateRDD(rdd):
            start = time.clock()
            data = rdd.map(lambda line: line.split(' ')).map(row)
            df = data.toDF(self.csvSchema)
            if df.count():
                curDF = df.union(self.stateDF)
                self.queryRDD(curDF)
                # Append to the global DF for batch outputs
                # self.globalDF = df.union(self.globalDF)
                self.outputQuery(curDF)
            self.totalTime += time.clock() - start

        lines.foreachRDD(iterateRDD)

    def queryRDD(self, df):
        df.createOrReplaceTempView('table')
        for key, value in self.dictInnerQuery.items():
            innerQuery = value[0]
            sqlDF = self.spark.sql(innerQuery)
            sqlRes = sqlDF.first()[0]
            self.dictInnerQuery[key][2] = sqlRes

        b = 14
        addToState = [False for i in range(df.count())]
        for key, value in self.dictInnerQuery.items():
            col = value[1]
            val = value[2]
            tupleList = [{col: x[col]} for x in df.rdd.collect()]
            for i in range(len(tupleList)):
                row = tupleList[i]
                if row[col] > val - b and row[col] < val + b:
                    addToState[i] = True

        itr = 0
        newRows = []
        newStateDF = self.spark.createDataFrame(self.sc.emptyRDD(), self.csvSchema)
        for row in df.rdd.collect():
            if addToState[itr]:
                newRows.append(row)
            itr += 1
        newStateDF = self.spark.createDataFrame(newRows, self.csvSchema)
        self.stateDF = newStateDF

        approxRows = newStateDF.sort('col1', ascending=False).collect()
        approxDF = self.spark.createDataFrame(approxRows, self.csvSchema)
        self.stateDF = self.spark.createDataFrame(approxDF.head(60), self.csvSchema)

    def outputQuery(self, df):
        curQuery = ' '.join(list(map(
            (lambda word: str(round(self.dictInnerQuery[word][2], 2))
             if word in self.dictInnerQuery else word),
            self.modQuery.split())))
        df.createOrReplaceTempView('table')
        streamOut = self.spark.sql(curQuery).first()[0]
        print(streamOut)
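# A possible driver for the Reader class above; the file layout under
# 'Data/Live' and the query shape are assumptions, not from the original.
reader = Reader()
reader.inputSQLQuery('SELECT AVG(col1) FROM table WHERE col1 > (SELECT AVG(col2) FROM table)')
reader.initStream()  # blocks; each streamed batch is merged with the kept state and queried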
        lambda a, b: a + b)  # oldrdd U rdd -> sort -> (key, amount)
    tmpFile = NamedTemporaryFile(delete=True)
    tmpFile.close()
    newrdd.saveAsPickleFile(tmpFile.name)
    open(filename, "w")  # truncate the logfile (remove all processed logs)
    result = newrdd.collect()
    return result

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Usage: app.py <logfile>')
        sys.exit(-1)

    KeyspaceName = 'syslog'
    TableName = 'statistics'
    cluster = cascl.Cluster()
    CreateKeySpaceAndTable(cluster, KeyspaceName, TableName)

    conf = SparkConf().setAppName('CountingSyslogsByHours')
    sc = SparkContext(conf=conf)

    tmpFile = NamedTemporaryFile(delete=True)
    tmpFile.close()
    sc.emptyRDD().saveAsPickleFile(tmpFile.name)

    statistics = SparkCalculate(sc, sys.argv[1], tmpFile=tmpFile)

    # writing to Cassandra
    SaveToDB(statistics, cluster)
    # printing from Cassandra
    printFromDb(cluster)

    cluster.shutdown()
    sc.stop()
    return (node, 1 / n)

if __name__ == "__main__":
    sc = SparkContext(appName="pagerank")
    lines = sc.textFile(sys.argv[1])
    count = lines.count()
    N = 1 / count
    links = lines.map(lambda x: parse(x)).cache()

    contribs = sc.emptyRDD()
    # seed every node with its initial rank
    initial_ranks = links.map(lambda r: initial_rank(r, count))
    frac = initial_ranks.map(lambda p: p).filter(lambda f: f is not None)

    for i in range(10):
        fractionals = initial_ranks.map(lambda p: rank_fractions(p)) \
            .filter(lambda f: f is not None)
        contribs = fractionals.reduceByKey(add)
        # damped PageRank update: rank = 0.15 + 0.85 * summed contributions
        ranks = contribs.mapValues(lambda v: .15 + .85 * v)
        print(ranks.collect())

    joined = links.join(frac)
    print(joined.collect())
float(15) / 2

# In[96]:

start_time = time.time()

variants_case = sqlContext.sql(
    "SELECT patient,chr,pos,reference,alternative,gene_symbol,zygosity "
    "FROM parquetFile " + sqlCase)
patientsID_case = sorted(variants_case.map(lambda v: v[0]).distinct().collect())

if sqlControl != "NULL":
    variants_control = sqlContext.sql(
        "SELECT patient,chr,pos,reference,alternative,gene_symbol,zygosity "
        "FROM parquetFile " + sqlControl)
    # controlMAF = float(controlMAF)
else:
    variants_control = sc.emptyRDD()
    # controlMAF = 0

patientsID_control = sorted(variants_control.map(lambda v: v[0]).distinct().collect())
patientsID = patientsID_case + patientsID_control
patientsID_dictionnary = dict(zip(patientsID, range(len(patientsID))))
patientsID_split_index_b = sc.broadcast(len(patientsID_case))
patientsID_dictionnary_b = sc.broadcast(patientsID_dictionnary)

variants = variants_control.unionAll(variants_case)
variants_grouped = variants.map(createKey_VariantGene).groupByKey()
controlMAF_b = sc.broadcast(controlMAF)