def _calc_model_metrics(df: DataFrame, prob_col: str, label_col: str) -> Dict[str, float]:
    r"""Calculate model metrics at the maximum-F1 threshold given probabilities and labels.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    prob_col : str
        Column name holding the raw probability of being in class 1.
    label_col : str

    Returns
    -------
    max_metrics : dict
        Dict with keys 'tp', 'tn', 'fp', 'fn', 'informativeness' (F1),
        'accuracy', 'precision', 'recall', and 'threshold', with float values.

    Raises
    ------
    UncaughtException
    """
    _persist_if_unpersisted(df)

    # One row per distinct probability, with counts of label 0 and label 1.
    metrics_df = df.groupby(prob_col).pivot(label_col).count().fillna(value=0)
    metrics_df.persist(StorageLevel(False, True, False, False))

    # Running label counts below each probability threshold give the fn/tn columns.
    window = Window.orderBy(prob_col).rowsBetween(Window.unboundedPreceding, -1)
    metrics_df = metrics_df.withColumn('fn', F.sum(F.col(str(1))).over(window))
    metrics_df = metrics_df.withColumn(
        'tn', F.sum(F.col(str(0))).over(window)).fillna(value=0)
    metrics_df.persist(StorageLevel(False, True, False, False, 1))

    all_count = df.count()
    pos_count = df.where(F.col(label_col) == 1).count()
    neg_count = all_count - pos_count

    metrics_df = metrics_df.withColumn('tp', pos_count - F.col('fn'))
    metrics_df = metrics_df.withColumn('fp', neg_count - F.col('tn'))
    metrics_df = metrics_df.withColumn('precision',
                                       (F.col('tp')) / (F.col('tp') + F.col('fp')))
    metrics_df = metrics_df.withColumn('recall', F.col('tp') / pos_count)
    metrics_df = metrics_df.withColumn(
        'informativeness',
        2 * (F.col('precision') * F.col('recall')) / (F.col('precision') + F.col('recall')))
    metrics_df.persist(StorageLevel(False, True, False, False))

    # The row at maximum F1 ('informativeness') supplies the reported metrics.
    max_metrics = metrics_df.where(
        F.col('informativeness') == metrics_df.select(
            F.max(F.col('informativeness'))).take(1)[0][0]).take(1)[0].asDict()
    max_metrics['accuracy'] = (max_metrics['tp'] + max_metrics['tn']) / all_count
    max_metrics['threshold'] = max_metrics[prob_col]
    return max_metrics
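# Note: the snippets in this collection build storage levels positionally.
# For reference, a short sketch mapping PySpark's positional constructor,
# whose argument order is (useDisk, useMemory, useOffHeap, deserialized,
# replication=1), onto the named constants it corresponds to:

from pyspark import StorageLevel

MEMORY_ONLY = StorageLevel(False, True, False, False, 1)      # same fields as StorageLevel.MEMORY_ONLY
MEMORY_AND_DISK = StorageLevel(True, True, False, False, 1)   # same fields as StorageLevel.MEMORY_AND_DISK
DISK_ONLY = StorageLevel(True, False, False, False, 1)        # same fields as StorageLevel.DISK_ONLY

# Using the named constants is usually clearer than the positional form.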
def stage3(args):
    processed_path = args.wikidata_path.replace('.json', '-processed.json')
    processed_rdd = sc.textFile(processed_path).map(lambda s: literal_eval(s))
    processed_rdd.persist(StorageLevel(True, True, False, False, 1))

    ## Restore last checkpoint
    start_level = 0
    for i in range(args.level):
        expanded_path = args.wikidata_path.replace(
            '.json.bz2', '-expanded-level{}.txt'.format(i))
        if os.path.exists(expanded_path):
            start_level = i + 1
    if start_level > 0:
        print(f'Starting level {start_level} > 0, restoring previous checkpoint')
        expanded_path = args.wikidata_path.replace(
            '.json.bz2', '-expanded-level{}.txt'.format(start_level - 1))
        all_categories = sc.textFile(expanded_path).map(literal_eval)
    else:
        all_categories = processed_rdd\
            .filter(lambda tup: tup[1]['wiki_title'] is not None)\
            .map(lambda tup: (tup[0], (0, {}, tup[1]['instance_of'] + tup[1]['subclass_of'] + tup[1]['occupation'])))

    for i in range(start_level, args.level):
        next_level_categories = all_categories\
            .filter(lambda tup: tup[1][-1] != [])\
            .flatMap(lambda tup: [(category, tup[0]) for category in tup[1][-1]])\
            .join(processed_rdd)\
            .map(lambda tup: (tup[1][0], tup[1][1]['subclass_of']))\
            .reduceByKey(add, 512)
        next_all_categories = all_categories.leftOuterJoin(next_level_categories, 512)\
            .map(update_categories)
        next_all_categories.persist(StorageLevel(True, True, False, False, 1))
        all_categories.unpersist()
        all_categories = next_all_categories

        unfinished_entities = all_categories.filter(
            lambda tup: tup[1][-1] != []).count()
        print('After {} iterations, there are still {} entities that need to be expanded'
              .format(i + 1, unfinished_entities))

        expanded_path = args.wikidata_path.replace(
            '.json.bz2', '-expanded-level{}.txt'.format(i))
        print('Persisting it onto disk with file name {}'.format(expanded_path))
        all_categories.saveAsTextFile(
            expanded_path, 'org.apache.hadoop.io.compress.BZip2Codec')
        if i > 0:
            old_expanded_path = args.wikidata_path.replace(
                '.json.bz2', '-expanded-level{}.txt'.format(i - 1))
            print(f'Removing old directory {old_expanded_path}')
            rmtree(old_expanded_path)
        if unfinished_entities == 0:
            break
def word_count_plus_persist_mem_only(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])
        sc = SparkContext(conf=conf,
                          pyFiles=[
                              'run_app_small.py', 'run_app.py', 'sesgo_scripts.py',
                              'persist_scripts.py', 'repartition_scripts.py',
                              'config_scripts.py', 'wordCountConfig.py'
                          ])
        data = sc.textFile(data_file)
        words = data.flatMap(mymapeo)
        # MEMORY_ONLY: StorageLevel(useDisk=False, useMemory=True, ...),
        # matching the storage strategy this benchmark's name advertises
        frequencies = words.map(lambda x: (x, 1)).reduceByKey(
            lambda a, b: a + b).persist(
                StorageLevel(False, True, False, False, int(internal_param[7])))
        topFreqs = frequencies.sortBy(lambda x: x[1], ascending=False).persist(
            StorageLevel(False, True, False, False, int(internal_param[7])))
        print('Top 5 frequencies:', topFreqs.take(5))
        leastFreqs = frequencies.sortBy(lambda x: x[1], ascending=True)
        print('Least 5 frequencies:', leastFreqs.take(5))
        topLenFreqs = topFreqs.sortBy(lambda x: len(x[0]), ascending=False).persist(
            StorageLevel(False, True, False, False, int(internal_param[7])))
        print('Top 5 length:', topLenFreqs.take(5))
        containsAwords = words.filter(lambda x: 'a' in x)
        print('Number of words containing a:', containsAwords.count())
        containsXwords = words.filter(lambda x: 'x' in x)
        print('Number of words containing x:', containsXwords.count())
        app_id = sc.applicationId
        sc.stop()
        return app_id
    except:
        print("Configuration error: " + str(internal_param))
        sc.stop()
def _train_spark(data, n_components, n_pc, covar_types, verbose, n_jobs, n_iter_search):
    # Spark configuration.
    conf = (SparkConf().setMaster("local[" + str(n_jobs) + "]")
            .setAppName("FDD")
            .set("spark.executor.memory", "512mb")
            .set("spark.cores.max", str(n_jobs)))
    sc = SparkContext(conf=conf)
    # Build hyperparameter vectors.
    parameters = cartesian((n_components, n_pc, covar_types))
    # Distribute the hyperparameter vectors.
    parameters_rdd = sc.parallelize(parameters, 96)
    # Broadcast the data to all workers.
    data_broadcast = sc.broadcast(data)
    # Train a model for each hyperparameter set.
    models = parameters_rdd.map(
        lambda param: train_with_parameters(param, data_broadcast))
    # Persist the models to avoid re-computation.
    models.persist(StorageLevel(True, True, False, True, 1))
    # Sort by BIC.
    sorted_models = models.sortBy(lambda model: model[0])
    # The first is the best model.
    best_model = sorted_models.collect()[0][1]
    sc.stop()
    return best_model
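# The cartesian helper used above is not shown in this snippet. A minimal
# sketch of one plausible implementation, assuming it should return every
# (n_components, n_pc, covar_type) combination as one row per hyperparameter
# set (the name and exact return type are assumptions):

from itertools import product

def cartesian(arrays):
    # Cartesian product of the given parameter sequences,
    # one combination per row.
    return [list(combo) for combo in product(*arrays)]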
def main(input_file, output_file):
    sc = SparkContext('local[*]', 'srinagapavanisirichandana_pisupati_task1')
    textRDD = sc.textFile(input_file).repartition(2)
    reviews = textRDD.map(parse_review).persist(StorageLevel(True, True, False, False, 1))
    review_counts_2018 = reviews.filter(lambda r: is_year(r, '2018'))
    distinct_users = reviews.map(lambda r: r["user_id"]).distinct()
    top_users = reviews.map(lambda review: (review["user_id"], 1)).reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda x: (-1 * x[1], x[0]), numPartitions=1).take(10)
    distinct_businesses = reviews.map(lambda r: r["business_id"]).distinct()
    top_businesses = reviews.map(lambda review: (review["business_id"], 1)).reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda x: (-1 * x[1], x[0]), numPartitions=1).take(10)
    result = {
        "n_review": reviews.count(),
        "n_review_2018": review_counts_2018.count(),
        "n_user": distinct_users.count(),
        "top10_user": top_users,
        "n_business": distinct_businesses.count(),
        "top10_business": top_businesses
    }
    fp = open(output_file, "w")
    fp.write(json.dumps(result))
    fp.close()
def test(allHex, hashFiles, sc, sqlc, path, featureFitModel):
    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/" + x + ".bytes")

    def fun(accum, x):
        return accum + ',' + x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1 = sc.wholeTextFiles(bytesFileString, 20)
    bytesRdd = rdd1.map(lambda x: x[1].split()).map(
        lambda x: [str(int(word, 16)) for word in x if word in allHex.value]
    ).zipWithIndex().map(lambda x: (x[1], x[0]))
    Vec = bytesRdd.map(lambda x: (x[0], createVector(x[1])))
    sparseVec = Vec.map(
        lambda x: (x[0], SparseVector(256, numpy.nonzero(x[1])[0], x[1][x[1] > 0])))
    ngramFrame = sqlc.createDataFrame(sparseVec, ["did", "1grams"])
    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)
    # The column created above is 'did', so index the Row with that name,
    # and run the fitted CountVectorizer over the frame that actually has
    # the 'docFeatures' column.
    featuresDF = ngramFrame.rdd.map(
        lambda x: Row(did=x['did'], docFeatures=x['1grams'] + x['2grams'])).toDF()
    featuresCV = featureFitModel.transform(featuresDF)
    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData, path)
    testData.show()
def train(allHex, labels, hashFiles, sc, sqlc, path):
    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/" + x + ".bytes")

    def fun(accum, x):
        return accum + ',' + x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1 = sc.wholeTextFiles(bytesFileString, 20)
    bytesRdd = rdd1.map(lambda x: x[1].split()).map(
        lambda x: [word for word in x if word in allHex.value]
    ).zipWithIndex().map(lambda x: (x[1], x[0]))
    ngramFrame = sqlc.createDataFrame(bytesRdd, ["did", "1grams"])
    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)
    # 'did' is the column name defined above; fit and transform on the frame
    # that carries the combined 'docFeatures' column.
    featuresDF = ngramFrame.rdd.map(
        lambda x: Row(did=x['did'], docFeatures=x['1grams'] + x['2grams'])).toDF()
    cv = CountVectorizer(inputCol="docFeatures", outputCol="features", vocabSize=1000)
    featureFitModel = cv.fit(featuresDF)
    featuresCV = featureFitModel.transform(featuresDF)
    labelRdd = labels.zipWithIndex().map(lambda x: (x[1], x[0]))
    labelFrame = labelRdd.toDF(["did", "label"])
    trainData = featuresCV.join(labelFrame, "did")
    trainData.persist(StorageLevel(True, True, False, False, 1))
    saveData(trainData, path)
    trainData.show()
    return featureFitModel
def word_count_persist_disk_only(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])
        sc = SparkContext(conf=conf,
                          pyFiles=[
                              'run_app_small.py', 'run_app.py', 'sesgo_scripts.py',
                              'persist_scripts.py', 'repartition_scripts.py',
                              'config_scripts.py', 'wordCountConfig.py'
                          ])
        data = sc.textFile(data_file)
        # DISK_ONLY: StorageLevel(useDisk=True, useMemory=False, ...),
        # matching the storage strategy this benchmark's name advertises
        words = data.flatMap(mymapeo).persist(
            StorageLevel(True, False, False, False, int(internal_param[7])))
        frequencies = words.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
        frequencies.collect()
        print(frequencies.take(5))
        app_id = sc.applicationId
        sc.stop()
        return app_id
    except:
        print("Configuration error: " + str(internal_param))
        sc.stop()
def main():
    # Logistic regression or Random forest
    output_type = sys.argv[1]
    raw_file_path = sys.argv[2]
    path_to_output = sys.argv[3]
    raw_input_rdd = sc.textFile(
        raw_file_path, minPartitions=32).map(lambda line: line.encode("utf-8"))
    process_data = raw_input_rdd.map(lambda line: replacetab(line))
    df_for_pp = create_df(process_data)
    pp = PreProcess(df_for_pp)
    preprocessed_data = pp.preprocess_data()
    data = None
    if output_type == "rf":
        data = prep_rf(preprocessed_data)
    # write the prepared data to file
    data.persist(StorageLevel(True, True, False, False, 1))
    data.write.parquet(path_to_output + "final_" + output_type + "_data.parquet")
def main():
    # Initialize Spark (note: this script is Python 2, hence the
    # tuple-unpacking lambdas and print >> statements below)
    configuration = SparkConf().setAppName("1000-genomes Project")
    spark_context = SparkContext(conf=configuration)
    container_name = "1000-genomes-dataset"
    config = {'user': os.environ['OS_USERNAME'],
              'key': os.environ['OS_PASSWORD'],
              'tenant_name': os.environ['OS_TENANT_NAME'],
              'authurl': os.environ['OS_AUTH_URL']}

    # Connect to Object-Storage to retrieve filenames
    conn = swiftclient.client.Connection(auth_version=3, **config)
    (storage_url, auth_token) = conn.get_auth()
    (response, content) = swiftclient.client.get_container(
        url=storage_url, container=container_name, token=auth_token)
    num_files_to_include = 3
    names = filter(lambda t: t['name'][-4:] == '.bam', content)
    filelist = [{"name": c['name'].strip(), "hash": c['hash'].strip()}
                for c in names[:num_files_to_include]]
    numpart = len(filelist) / 2

    # Distribute the processing of the files
    filenames = spark_context.parallelize(filelist, numpart)
    mapped_data = filenames.flatMap(process).persist(StorageLevel(True, True, False, True, 1))

    # Filter the different results returned from the processing
    start_time_filtering = time.time()
    kmers = mapped_data.filter(lambda (k, (v, e)): k == "KMER").map(lambda (k, v): v).reduceByKey(add, numPartitions=5)
    positions = mapped_data.filter(lambda (k, v): k == "POSITION").map(lambda (k, v): v).reduceByKey(add, numPartitions=5)
    time_filtering = time.time() - start_time_filtering
    time_mapping = mapped_data.filter(lambda (k, v): k == "TIME-MAPPING").map(lambda (k, v): v).reduceByKey(add, numPartitions=5)
    time_download = mapped_data.filter(lambda (k, v): k == "TIME-DOWNLOAD").map(lambda (k, v): v).reduceByKey(add, numPartitions=5)
    time_mapping = time_mapping.collect()
    time_download = time_download.collect()

    # Write results to files
    timing_file = open("timing.txt", "w")
    timing_file.write("Mapping " + str(time_mapping) + "\n")
    timing_file.write("Mapping+Downloading " + str(time_download) + "\n")
    timing_file.write("Filtering " + str(time_filtering) + "\n")
    timing_file.close()
    kmer_file = open("kmers.txt", "w")
    for item in kmers.collect():
        print >> kmer_file, item
    kmer_file.close()
    pos_file = open("positions.txt", "w")
    for item in positions.collect():
        print >> pos_file, item
    pos_file.close()
    for obj in [open(f, "r") for f in ["kmers.txt", "positions.txt"]]:
        swiftclient.client.put_object(url=storage_url, token=auth_token,
                                      container="result",
                                      name="group_11/" + str(start_time_filtering) + "_" + obj.name,
                                      contents=obj)
        obj.close()
    return
def __init__(self, outfile):
    self.spark = SparkSession.builder.appName(
        "sparkUserAndItemData").enableHiveSupport().getOrCreate()
    # self.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    # self.spark.conf.set("spark.sql.crossJoin.enabled", "true")
    self.out_files = outfile
    self.age_dict = {i: str(i + 1) for i in range(100)}
    self.workid_dict = {i: str(i + 1) for i in range(100)}
    self.sex_dict = {'f': '1', 'm': '2', '-1': '0'}
    self.height_dict = {i: str(i - 99) for i in range(100, 227)}
    # dictionaries of valid categories
    # self.user_data = self.spark.sql(UserSql).persist(StorageLevel(True, True, False, False, 1))
    # DISK_ONLY = StorageLevel(True, False, False, False, 1)
    # DISK_AND_ME
    self.user_data, self.item_data = self.feature_convert_id()
    self.user_data.persist(StorageLevel(True, True, False, False, 1))
    self.item_data.persist(StorageLevel(True, False, False, False, 1))
def main(input_file_review, input_file_business, output_file_a, output_file_b):
    sc = SparkContext('local[*]', 'srinagapavanisirichandana_pisupati_task3')
    review_rdd = sc.textFile(input_file_review).repartition(2)
    business_rdd = sc.textFile(input_file_business).repartition(2)
    parsed_reviews = review_rdd.map(parse_review)
    parsed_businesses = business_rdd.map(parse_business)
    city_stars = parsed_businesses.leftOuterJoin(parsed_reviews).map(lambda x: x[1]) \
        .filter(lambda x: x[1] is not None) \
        .combineByKey(lambda v: (v, 1),
                      lambda x, c: (x[0] + c, x[1] + 1),
                      lambda x, y: (x[0] + y[0], x[1] + y[1])) \
        .map(lambda bac: (bac[0], round(bac[1][0] / bac[1][1], 1))) \
        .sortBy(lambda x: (-1 * x[1], x[0].lower()), numPartitions=1) \
        .map(lambda b: b[0] + ',' + str(b[1])).persist(StorageLevel(True, True, False, False, 1))

    fp = open(output_file_a, "w")
    fp.write('city,stars\n')
    ress = city_stars.collect()
    # city_stars.map(lambda b: b[0] + ',' + str(b[1])).saveAsTextFile(output_file_a)
    for r in ress:
        fp.write(r + '\n')
    fp.close()

    collect_init_seconds = time.time()
    print('city,stars')
    collect_stars = (city_stars.collect())[:10]
    for r in collect_stars:
        print(r)
    m1 = time.time() - collect_init_seconds

    take_init_seconds = time.time()
    print('city,stars')
    take_stars = city_stars.take(10)
    for r in take_stars:
        print(r)
    m2 = time.time() - take_init_seconds

    if m1 > m2:
        explanation = "Method 2, i.e. taking the first 10 cities, is faster than Method 1"
    else:
        explanation = "Method 1, i.e. collecting all the data and printing the first 10 cities, is faster than Method 2"
    result = {
        "m1": round(m1, 1),
        "m2": round(m2, 1),
        "explanation": explanation
    }
    fp = open(output_file_b, "w")
    fp.write(json.dumps(result))
    fp.close()
def word_count_sort_persist_mem_and_disk_ser(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])
        sc = SparkContext(conf=conf,
                          pyFiles=[
                              'run_app_small.py', 'run_app.py', 'sesgo_scripts.py',
                              'persist_scripts.py', 'repartition_scripts.py',
                              'config_scripts.py', 'wordCountConfig.py'
                          ])
        data = sc.textFile(data_file)
        # MEMORY_AND_DISK (PySpark data is always stored serialized)
        words = data.flatMap(mymapeo).persist(
            StorageLevel(True, True, False, False, int(internal_param[7])))
        frequencies = words.map(lambda x: (x, 1)).reduceByKey(
            lambda a, b: a + b).persist(
                StorageLevel(True, True, False, False, int(internal_param[7])))
        numWords = data.count()
        sortFreq = frequencies.sortBy(lambda x: x[1], ascending=False)
        topFreqs = sortFreq.take(5)
        print('Number of words: ', numWords)
        print('Words frequencies:', sortFreq.collect())
        print('Top 5 frequencies:', topFreqs)
        app_id = sc.applicationId
        sc.stop()
        return app_id
    except:
        print("Configuration error: " + str(internal_param))
        sc.stop()
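# These benchmark helpers read their Spark settings positionally from
# internal_param. A hypothetical invocation follows; the concrete values and
# the HDFS path are assumptions, and index 0 is not read by the helpers:

# [1] app name, [2] driver cores, [3] driver memory, [4] executor instances,
# [5] executor memory, [6] executor cores, [7] persist replication factor
params = ['unused', 'wc-mem-and-disk', '2', '4g', '4', '2g', '2', '1']
app_id = word_count_sort_persist_mem_and_disk_ser(params, 'hdfs:///data/corpus.txt')
print('Finished application', app_id)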
def test(opcodes, hashFiles, sc, sqlc, path, featureFitModel):
    asmFiles = hashFiles.map(
        lambda x: "gs://uga-dsp/project2/data/asm/" + x + ".asm")

    def fun(accum, x):
        return accum + ',' + x

    asmFileString = asmFiles.reduce(fun)
    rdd1 = sc.wholeTextFiles(asmFileString, 20)
    opcodesInDoc = rdd1.map(lambda x: x[1].split()).map(
        lambda x: [word for word in x if word in opcodes.value]
    ).zipWithIndex().map(lambda x: (x[1], x[0]))
    ngramFrame = sqlc.createDataFrame(opcodesInDoc, ["docId", "opcodes"])
    twoGram = NGram(n=2, inputCol="opcodes", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)
    threeGram = NGram(n=3, inputCol="opcodes", outputCol="3grams")
    ngramFrame = threeGram.transform(ngramFrame)
    fourGram = NGram(n=4, inputCol="opcodes", outputCol="4grams")
    ngramFrame = fourGram.transform(ngramFrame)

    def getSegment(x):
        templist = []
        for line in x:
            l = re.findall(r'\w+:?(?=:)', line)
            if l:
                templist.append(l[0])
        return templist

    segments = rdd1.zipWithIndex().map(
        lambda x: (x[1], x[0][1].splitlines())
    ).map(lambda x: (x[0], getSegment(x[1]))).toDF(["docId", "segments"])
    featureFrame = ngramFrame.join(segments, "docId")
    featuresDF = featureFrame.rdd.map(
        lambda x: Row(did=x['docId'],
                      docFeatures=x['opcodes'] + x['2grams'] + x['3grams'] +
                      x['4grams'] + x['segments'])).toDF()
    featuresCV = featureFitModel.transform(featuresDF)
    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData, path)
    testData.show()
def analyse(path):
    # read file
    df = spark.read.option('header', True) \
        .option('inferSchema', True) \
        .csv(path) \
        .persist(StorageLevel(True, True, False, False))
    df.createOrReplaceTempView("survey_results")
    # functions
    get_developer_os()
    get_developer_type()
    get_contrib_open_source(df)
    get_coding_as_hobby(df)
def generate_fake_news(data, num_entries, min_length, max_length):
    from pyspark import StorageLevel

    modd_data = data.map(lambda x: x + " SENTENCE_END")
    mrkv_pairs = modd_data.map(lambda x: x.split())\
        .map(lambda x: [x[0].capitalize()] + x[1:])\
        .flatMap(lambda x: [(x[i], x[i + 1]) for i in range(len(x) - 1)])\
        .cache()
    revrsed_mrkv_pairs = mrkv_pairs.map(lambda x: (x[1], x[0]))\
        .partitionBy(data.getNumPartitions())\
        .cache()

    cur_matrix = mrkv_pairs.persist(StorageLevel(True, True, False, False, 1))
    cur_matrix = cur_matrix.map(lambda x: (x[0], (x[1], [x[0], x[1]]))).persist()
    for i in range(max_length):
        cur_matrix = revrsed_mrkv_pairs.join(revrsed_mrkv_pairs)\
            .map(lambda x: (x[1][0], (x[0], x[1][0] + x[1][1][1]))).persist()
        print(cur_matrix.take(1))

    news_entries = []
    for _ in range(num_entries):
        first_element = mrkv_pairs.filter(
            lambda x: x[0] != "SENTENCE_END" and x[1] != "SENTENCE_END"
            and x[0][0].isupper()).takeSample(False, 1)[0]
        fake_news = first_element[0]
        cur_wrd = first_element[1]
        for i in range(max_length):
            cur_pool = mrkv_pairs.filter(lambda x: x[0] == cur_wrd).cache()
            if i < min_length:
                cur_pool = cur_pool.filter(lambda x: x[1] != "SENTENCE_END").cache()
            if cur_pool.count() == 0:
                i = 0
                fake_news = first_element[0]
                cur_wrd = first_element[1]
                continue
            cur_pair = cur_pool.takeSample(False, 1)[0]
            fake_news += (" " + cur_pair[0])
            cur_wrd = cur_pair[1]
            if cur_wrd == "SENTENCE_END":
                fake_news += "."
                break
        news_entries.append(fake_news)
    return news_entries
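# Filtering the full pair RDD once per generated word launches a Spark job per
# step, which is expensive. A sketch of a faster variant, assuming the set of
# transitions fits in driver memory (a design alternative, not the original
# code): collect the successor lists once, then walk the chain locally.

import random

transitions = mrkv_pairs.groupByKey().mapValues(list).collectAsMap()

def walk(first_word, max_length):
    # Sample the next word locally from the pre-collected successor lists.
    sentence = [first_word]
    cur = first_word
    for _ in range(max_length):
        nxt = random.choice(transitions[cur])
        if nxt == "SENTENCE_END":
            break
        sentence.append(nxt)
        cur = nxt
    return " ".join(sentence)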
def main(train_file, model_file, stopwords_file):
    SparkContext.setSystemProperty('spark.executor.memory', '4g')
    SparkContext.setSystemProperty('spark.driver.memory', '4g')
    sc = SparkContext.getOrCreate()
    start = time.time()
    stopwords = {s for s in sc.textFile(stopwords_file).collect()}
    reviews = (sc.textFile(train_file)
               .map(json.loads)
               .map(lambda d: tokenize(d, stopwords))
               .persist(StorageLevel(True, True, False, False)))
    n = reviews.count()

    # calculate the number of documents each term appears in
    dfs = (reviews.flatMap(lambda d: d["tokens"])
           .map(lambda t: (t, 1))
           .reduceByKey(add)
           .collectAsMap())
    idfs = {k: math.log(n / v) for k, v in dfs.items()}

    def add_key_prefix(rdd, prefix):
        return rdd.map(lambda x: ("{}_{}".format(prefix, x[0]), x[1]))

    business_profiles = build_profile(reviews, "business_id", idfs)
    user_profiles = (reviews.map(lambda d: (d["business_id"], d["user_id"]))
                     .join(business_profiles)
                     .values()
                     .aggregateByKey({}, merge, merge))
    add_key_prefix(user_profiles, "u").union(
        add_key_prefix(business_profiles, "b")).map(json.dumps).saveAsTextFile('task2_model')

    with open(model_file, 'wb') as outfile:
        for filename in glob.glob('task2_model/part*'):
            with open(filename, 'rb') as readfile:
                shutil.copyfileobj(readfile, outfile)
    shutil.rmtree('task2_model')
    print("Duration:", time.time() - start)
def word_count_sort_persist_mem_only(internal_param, data_file):
    try:
        conf = SparkConf().setMaster("spark://dana:7077").setAppName(
            internal_param[1]).setAll([
                ('spark.driver.cores', internal_param[2]),
                ('spark.driver.memory', internal_param[3]),
                ('spark.executor.instances', internal_param[4]),
                ('spark.executor.memory', internal_param[5]),
                ('spark.executor.cores', internal_param[6])
            ])
        sc = SparkContext(conf=conf,
                          pyFiles=['run_app.py', 'config_scriptsDf.py'])
        spark = SparkSession.builder.config(conf=conf).getOrCreate()
        data = sc.textFile(data_file).flatMap(lambda x: x.split(" ")).collect()
        paralData = sc.parallelize(data, 200)
        df = paralData.map(lambda r: Row(r)).toDF(["word"])
        # MEMORY_ONLY: StorageLevel(useDisk=False, useMemory=True, ...),
        # matching the storage strategy this benchmark's name advertises
        cleanDf = df.filter(col('word') != '').withColumn(
            'word', regexp_replace(col('word'), r'[^\sa-zA-Z0-9]', '')).persist(
                StorageLevel(False, True, False, False, int(internal_param[7])))
        freqDf = cleanDf.withColumn(
            'count', lit(1)).groupBy('word').sum('count').withColumnRenamed(
                'sum(count)', 'frequencies')
        # descending order so the limit really returns the top 5
        topFreqsDf = freqDf.orderBy(col('frequencies').desc()).limit(5)
        print('Number of words: ', cleanDf.count())
        print('Top 5 frequencies:')
        topFreqsDf.show()
        app_id = sc.applicationId
        sc.stop()
        return app_id
    except:
        print("Configuration error: " + str(internal_param))
        sc.stop()
def main():
    spark = SparkSession \
        .builder \
        .appName("RandomForest") \
        .config("spark.executor.heartbeatInterval", "60s") \
        .getOrCreate()
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)
    sc.setLogLevel("INFO")
    train_df = spark.read.parquet(sys.argv[1])
    # Persist the data in memory and on disk
    train_df.persist(StorageLevel(True, True, False, False, 1))
    rfc = RandomForestClassifier(maxDepth=8, maxBins=2400000,
                                 numTrees=128, impurity="gini")
    rfc_model = rfc.fit(train_df)
    rfc_model.save(sys.argv[2] + "rfc_model")
pan = None
print(time.clock() - start)


# I converted the csv to a parquet file to save space and time
def lis(x):
    return [float(i) for i in x[1:-1].split(',')]


spark.read.load("train_fet.csv", format="csv", inferSchema="true", header="true").rdd \
    .map(lambda x: (x[2], x[1], DenseVector(lis(x[0])))) \
    .toDF(["index", "file", "features"]) \
    .write.parquet("train_fet.parquet")

# Now I create the Bag of Visual Words representation using K-means
schema = spark.read.parquet("train_fet.parquet").persist(StorageLevel(True, True, False, False, 1))
start = time.clock()
kmeans = KMeans(k=K, initMode='random')
print(time.clock() - start)
start = time.clock()
model = kmeans.fit(schema)
print(time.clock() - start)
start = time.clock()
centers = model.clusterCenters()
print(time.clock() - start)
model.save('KmeansModel')

# Next I create the Hamming Embedding matrix
G = np.random.randn(db, d)
P, _ = np.linalg.qr(G)
def _eval_df_model(
        df: DataFrame,
        prob_mod: pyspark.ml.Model,
        sample_size: Optional[int] = 10**5,
        additional_metrics: bool = False
) -> propensity_model_performance_summary:
    r"""Calculate binary classification model metrics on the provided dataframe.

    Calculates accuracy, precision, and recall at the maximum value of
    informativeness (F1), and also provides AUC and AUPRC (area under the
    precision-recall curve).

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Dataframe holding the features and labels to evaluate on.
    prob_mod : pyspark.ml.Model
        Fitted binary classification model whose label, probability, and
        features column names are read from its params.
    sample_size
    additional_metrics

    Returns
    -------
    prob_mod_perf_sum : propensity_model_performance_summary

    Other Parameters
    ----------------
    sample_size : int
        Max sample size used to calculate performance. Defaults to 10**5;
        can be left as None to avoid sampling.

    Notes
    -----
    propensity_model_performance_summary : namedtuple
        'auc' : float
        'auprc' : float
            area under the precision-recall curve
        'threshold' : float
        'informativeness' (f1) : float
        'precision' : float
        'recall' : float
        'accuracy' : float

    Raises
    ------
    UncaughtExceptions

    See Also
    --------
    _calc_auc_auprc
    _calc_model_metrics
    """
    sample_df = _sample_df(df=df, sample_size=sample_size)
    _persist_if_unpersisted(sample_df)

    label_col = prob_mod.getOrDefault('labelCol')
    prob_col = prob_mod.getOrDefault('probabilityCol')
    features_col = prob_mod.getOrDefault('featuresCol')

    sample_df = prob_mod.transform(sample_df.select(features_col, label_col))
    # Extract P(class 1) from the probability vector into its own column.
    prob_1_col = "{prob_col}_1".format(prob_col=prob_col)
    sample_df = sample_df.withColumn(
        prob_1_col, F.udf(lambda x: float(x[1]), T.DoubleType())(F.col(prob_col)))
    sample_df.persist(StorageLevel(False, True, False, False, 1))

    auc, auprc = _calc_auc_auprc(df=sample_df, prob_col=prob_1_col, label_col=label_col)

    metric_keys = ['threshold', 'informativeness', 'precision', 'recall', 'accuracy']
    if additional_metrics:
        metrics_dict = _calc_model_metrics(df=sample_df, prob_col=prob_1_col,
                                           label_col=label_col)
        metrics_dict = {x: metrics_dict[x] for x in metrics_dict.keys()
                        if x in metric_keys}
    else:
        metrics_dict = {x: None for x in metric_keys}

    prob_mod_per_sum = propensity_model_performance_summary(auc=auc, auprc=auprc,
                                                            **metrics_dict)
    return prob_mod_per_sum
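# Both _calc_model_metrics and _eval_df_model lean on a _persist_if_unpersisted
# helper that is not shown in this collection. A minimal sketch of what such a
# helper might look like (the body is an assumption, not the original):

from pyspark import StorageLevel
from pyspark.sql import DataFrame


def _persist_if_unpersisted(df: DataFrame) -> None:
    # Persist at MEMORY_ONLY unless the dataframe already has a storage
    # level set (an unpersisted frame has useMemory and useDisk both False).
    if not (df.storageLevel.useMemory or df.storageLevel.useDisk):
        df.persist(StorageLevel(False, True, False, False, 1))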
def map1(tuple1):
    # header reconstructed to match map2 below; the fragment began mid-function
    name, image = tuple1
    shingles = []
    # shingle = list(image[:k, :k])
    for i in range(image.shape[0] - k):
        for j in range(image.shape[1] - k):
            red_img = image[i:i + k, j:j + k, :].flatten()
            shingles.append(tuple(red_img))  # , :].flatten()))
            # shingles.append(tuple(shingle))
    return (name, shingles)


rdd2 = rdd1.map(map1)

from pyspark import StorageLevel

DISK_ONLY = StorageLevel(True, False, False, False, 1)
rdd2.persist(DISK_ONLY)

rdd3 = rdd2.flatMap(lambda r: r[1])
k_shingle_space = rdd3.distinct().collect()
N = len(k_shingle_space)
print(N)


def map2(tuple1):
    name, image = tuple1
    # bool_vec = bitarray(N)
import time

# Slight modification to the baseline approach where a base layer is
# pre-materialized and, instead of starting the CNN inference all the way from
# raw images, we start from the materialized base layer.
if __name__ == '__main__':
    ############################ change appropriately ###################################
    model = 'alexnet'
    pre_mat_layer_index = -4  # from the top
    explore_layer_index = -1  # from the top
    struct_input = 'hdfs://spark-cluster-master:9000/foods.csv'
    pre_mat_input = 'hdfs://spark-cluster-master:9000/' + model + "_pre_mat_layer.parquet"
    heap_memory = 29
    num_executors = 1
    executor_cpu = 5
    storage_level = StorageLevel(True, True, False, True)  # memory and disk, deserialized
    sp_core_memory_fraction = 0.6
    ######################################################################################
    prev_time = time.time()
    if model == 'vgg16':
        initial_shape = VGG16.transfer_layers_shapes[pre_mat_layer_index]
    elif model == 'resnet50':
        initial_shape = ResNet50.transfer_layers_shapes[pre_mat_layer_index]
    elif model == 'alexnet':
        initial_shape = AlexNet.transfer_layers_shapes[pre_mat_layer_index]

    conf = SparkConf()
    app_name = 'pre-mat-' + model + "-l:" + str(pre_mat_layer_index)
    conf.setAppName(app_name)
    conf.set("spark.executor.memory", str(heap_memory) + "g")
    conf.set("spark.memory.fraction", sp_core_memory_fraction)
def create_context(host, port):
    """
    Function to create and set up a new streaming context.
    :param host: A string describing the IP to use for the HTTP client. Example: 'localhost'
    :param port: An integer value that corresponds to the port number used to read data from.
    :return: The created and configured streaming context.
    """
    spark_context = SparkContext(master="local[2]", appName="TwitterStreamApp")
    spark_context.setLogLevel("ERROR")
    # create the Streaming Context from the above spark context with a batch interval of 2 seconds
    streaming_context = StreamingContext(spark_context, 2)
    # set a checkpoint to allow RDD recovery
    streaming_context.checkpoint(CHECKPOINT)
    # read data from port 9009
    data_stream = streaming_context.socketTextStream(
        ADDRESS, PORT, storageLevel=StorageLevel(True, True, False, False, 2))
    # split each tweet into words
    words = data_stream.flatMap(lambda line: line.split(" "))
    # filter the words to get only hashtags, then map each hashtag to a pair of (hashtag, 1)
    hashtags = words.filter(lambda w: '#' in w).map(lambda x: (x, 1))
    # add the count of each hashtag to its last count
    tags_totals = hashtags.updateStateByKey(aggregate_tags_count)

    def process_rdd(time, rdd):
        """
        Function that processes an RDD.
        :param time: Time stamp of the process.
        :param rdd: The RDD to be processed.
        """
        print("----------- %s -----------" % str(time))
        try:
            if not rdd.isEmpty():
                # get the spark sql singleton context from the current context
                sql_context = get_sql_context_instance(rdd.context.getConf())
                # convert the RDD to a Row RDD
                row_rdd = rdd.map(lambda w: Row(hashtag=w[0], hashtag_count=w[1]))
                # create a DF from the Row RDD
                hashtags_dataframe = sql_context.createDataFrame(row_rdd)
                # register the dataframe as a table
                hashtags_dataframe.createOrReplaceTempView("hashtags")
                # get the top 10 hashtags from the table using SQL and print them
                hashtag_counts_dataframe = sql_context.sql(
                    "select hashtag, hashtag_count from hashtags order by hashtag_count desc limit 10")
                hashtag_counts_dataframe.show()

                # call this method to prepare the top 10 hashtags DF and send them
                def send_dataframe_to_dashboard(dataframe):
                    """
                    Function to send a DataFrame to the dashboard for visualization.
                    :param dataframe: Spark DataFrame created by process_rdd().
                    """
                    # extract the hashtags from the dataframe and convert them into an array
                    top_tags = [str(t.hashtag) for t in dataframe.select("hashtag").collect()]
                    # extract the counts from the dataframe and convert them into an array
                    tags_count = [p.hashtag_count for p in dataframe.select("hashtag_count").collect()]
                    # initialize and send the data through the REST API
                    request_data = {'label': str(top_tags), 'data': str(tags_count)}
                    response = post(dashboard_url, data=request_data)

                send_dataframe_to_dashboard(hashtag_counts_dataframe)
        except:
            pass

    # do processing for each RDD generated in each interval
    tags_totals.foreachRDD(process_rdd)
    return streaming_context
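# Since create_context registers a checkpoint directory, the natural driver
# code recovers from it with StreamingContext.getOrCreate. A sketch, reusing
# the CHECKPOINT, ADDRESS, and PORT module-level names the function already
# assumes:

from pyspark.streaming import StreamingContext

# Recover the streaming context from the checkpoint directory if one exists,
# otherwise build a fresh one via create_context.
ssc = StreamingContext.getOrCreate(CHECKPOINT,
                                   lambda: create_context(ADDRESS, PORT))
ssc.start()
ssc.awaitTermination()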
# Set spark configuration and spark context
conf = SparkConf().setAppName("part3").setMaster("local")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

# Read the input file as an RDD.
# Here we followed the RDD paper implementation closely: we first filter out
# comment lines containing "#", then split each row to get keys and values.
# Then we persist (cache) the data.
lines = sc.textFile(input_file).filter(lambda line: '#' not in line).map(
    lambda line: line.split("\t", 1)).persist(
        storageLevel=StorageLevel(False, True, False, False, 1))

# group by key operation
node_map = lines.groupByKey().partitionBy(partition_num)
# generate initial ranks
rank = node_map.mapValues(lambda e: 1.0).partitionBy(partition_num)

# Run ten iterations, each recalculating a rank for every node based on the
# PageRank algorithm. Contributions are calculated following
# https://en.wikipedia.org/wiki/PageRank and the original RDD paper.
for i in range(10):
    contribution = node_map.join(rank).flatMap(lambda rn: [
        (to_id, rn[1][1] / len(rn[1][0])) for to_id in rn[1][0]
    ]).reduceByKey(lambda rn1, rn2: rn1 + rn2)
    rank = contribution.mapValues(lambda cr: 0.15 + 0.85 * cr)
    else:
        return 0, 0, 0


rdd31 = rdd30.mapValues(stat_overdue)


# In[ ]:

rdd32 = rdd31.map(lambda x:)


# In[10]:

rdd31.persist(StorageLevel(False, True, True, False))


# In[ ]:

rdd31


# In[11]:

hdfs_workdir


# In[11]:
# lines = sc.textFile(folder).map(lambda l: list(imap(string.strip, l.split(';')))).cache
# if lines.count() % 7 != 0:
#     print "missing complete weeks in dataset"
#     #exit()

r = lines.filter(lambda x: municipio(x[9])) \
    .filter(lambda x: validate(x[3])) \
    .filter(lambda x: cavallo_week(x[3])) \
    .map(lambda x: ((x[1], cell2municipi[x[9]], week_month(x[3]), is_we(x[3]), day_of_week(x[3]), day_time(x[4])), 1)) \
    .distinct() \
    .map(lambda x: ((x[0][:4] + (x[0][5],)), 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda x: (x[0][0], [x[0][1:] + (x[1],), ])) \
    .reduceByKey(lambda x, y: x + y) \
    .persist(StorageLevel(False, True, False, False))

###
# Carrello (basket) format: user -> [(municipio, week, weekend/workday, time_slice, count), ...]
# note: count = days of presence in the region at that time slice
# week ordering
# keys: region, busiest week, workday/we, timeslice
r = r.map(lambda x: (x[0], sorted(x[1],
                                  key=lambda w: (w[0],
                                                 sum([z[4] for z in x[1] if z[1] == w[1]]),
                                                 -w[2], w[3]),
                                  reverse=True)))
r = r.map(lambda x: (x[0], normalize(x[1])))
def CalculateUser(line):
    station_val_map = station_broad.value
    imsi = line[0].split("#")[0]
    station = line[0].split("#")[1]
    visit_num = line[1]
    return (imsi, visit_num * station_val_map[station])


def list2map(l, n):
    m = {}
    for e in l:
        m[e[0]] = e[1] * 1.0 / n * 100
    return m


user_station_num = sqlContext.read.json("/input/zp/user_traj_403.json").rdd.flatMap(userStationNum)\
    .reduceByKey(lambda x, y: x + y).persist(StorageLevel(False, True, False, False, 1))

user_val = user_station_num.map(lambda line: (line[0].split("#")[0], 100))\
    .reduceByKey(lambda x, y: x).collect()
user_val_map = list2map(user_val, 100)
user_broad = sc.broadcast(user_val_map)

station_val = user_station_num.map(lambda line: (line[0].split("#")[1], 100))\
    .reduceByKey(lambda x, y: x).collect()
station_val_map = list2map(station_val, 100)
station_broad = sc.broadcast(station_val_map)

for i in range(30):
    station_val = user_station_num.map(CalculateStation).reduceByKey(lambda x, y: x + y).collect()
    station_val_max = 0
    for v in station_val:
        if v[1] > station_val_max:
            station_val_max = v[1]
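# CalculateStation is referenced above but not shown. By symmetry with
# CalculateUser, a plausible sketch (an assumption, not the original code)
# would score each station by the values of the users visiting it:

def CalculateStation(line):
    # Hypothetical counterpart to CalculateUser: weight each
    # (imsi#station, visit_num) record by the current user value.
    user_val_map = user_broad.value
    imsi = line[0].split("#")[0]
    station = line[0].split("#")[1]
    visit_num = line[1]
    return (station, visit_num * user_val_map[imsi])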
def main(filter_threshold, support, input_file, output_file):
    sc = SparkContext('local[*]', 'srinagapavanisirichandana_pisupati_task2')
    sc.setLogLevel("OFF")
    counts = {}

    def freq_product(a, b, k):  # aka join
        res = []
        joined = map(lambda x: tuple(sorted(x)),
                     itertools.combinations(
                         dict.fromkeys(itertools.chain.from_iterable(a)), k))
        for r in joined:
            if all_pairs_frequent(r, k):
                res += [r]
        return res

    def all_pairs_frequent(r, i):
        if i < 3:
            return True
        for x in itertools.combinations(r, i - 1):
            if filter_combinations(x):
                return False
        return True

    def filter_combinations(item_combination):
        cs = counts  # .value
        if item_combination in cs:
            return cs[item_combination] < support
        else:
            return True

    init_time = time.time()
    textRDD = sc.textFile(input_file)
    baskets_items = textRDD.filter(lambda l: l != "user_id,business_id").map(
        lambda l: l.split(',')).map(lambda x: (x[0], x[1])).groupByKey().map(
            lambda x: dict.fromkeys(x[1])).filter(
                lambda x: len(x) > filter_threshold).persist(
                    StorageLevel(True, True, False, False, 1))
    basket_items_with_sizes = baskets_items.flatMap(lambda x: x).map(
        lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    C = {1: sorted(basket_items_with_sizes.map(lambda x: (x[0],)).toLocalIterator())}
    freq_set = sorted(basket_items_with_sizes.filter(
        lambda x: x[1] >= support).map(lambda x: (x[0],)).toLocalIterator())
    L = {1: freq_set}
    k = 1

    def cmap(item_set, cs):
        cnts = {}
        for c in cs:
            cnts[c] = 0
            if all(elem in item_set for elem in c):
                cnts[c] += 1
        return cnts

    while len(L[k]) > 1:
        k += 1
        C[k] = freq_product(L[k - 1], L[k - 1], k)
        counts.update(baskets_items.map(lambda itemset: cmap(itemset, C[k])).fold({}, add_dicts))
        L[k] = list(itertools.filterfalse(filter_combinations, C[k]))

    fp = open(output_file, "w")
    fp.write("Candidates:\n")
    for c in C:
        if C[c]:
            if c == 1:
                print(*["('" + elem[0] + "')" for elem in C[c]], sep=",", file=fp)
            else:
                print(*(sorted(C[c])), sep=",", file=fp)
    fp.write("\nFrequent Itemsets:\n")
    for l in L:
        if L[l]:
            if l == 1:
                print(*["('" + elem[0] + "')" for elem in L[l]], sep=",", file=fp)
            else:
                print(*(sorted(L[l])), sep=",", file=fp)
    fp.close()
    final_time = time.time() - init_time
    print("Duration:", final_time, "seconds")
    total_score += PRIORPOLARITY[lexicons[w]['priorpolarity']] * TYPE[lexicons[w]['type']]
    return total_score


# Make sure Python uses UTF-8, as tweets contain emoticons and unicode
reload(sys)
sys.setdefaultencoding('utf-8')

# Use SQLContext for better support
sqlContext = SQLContext(sc)

# Define storage levels
DISK_ONLY_2 = StorageLevel(True, False, False, False, 2)
MEMORY_AND_DISK = StorageLevel(True, True, False, False, 1)

# Read GNIP's JSON files
directory = "tweets"
datasets = sqlContext.read.json(directory)
log('# Completed reading JSON files')

# Check checksum count
file_count = datasets.where(datasets['verb'].isNull()).count()
expect('file_count', file_count, 21888)

# Check post count
all_posts = datasets.where(datasets['verb'] == 'post')
all_posts_count = all_posts.count()
expect('all_posts_count', all_posts_count, 1570398)