def recom(matrix_file_name, user_file_name, output="re.out"):
    """Run the recommendation pipeline and save the ranked result.

    Two sequence files are read. The user records are flattened, mapped and
    sorted by integer key; the matrix records whose key also occurs in the
    user data are collected into the module-level global ``mt`` as a dict.
    Candidates produced by ``flat_recom`` are then reduced per key, stripped
    of keys that already occur in the user data, and sorted by integer value.

    Args:
        matrix_file_name: Path of the sequence file holding the matrix.
        user_file_name: Path of the sequence file holding the user data.
        output: Directory the single-partition text result is written to.
    """
    global mt

    sc = SparkContext("local[8]", "Recommendation")
    try:
        matrix = sc.sequenceFile(matrix_file_name)
        user = sc.sequenceFile(user_file_name)

        user_tuples = user.flatMap(flat_user) \
                          .map(map_user) \
                          .sortByKey(keyfunc=lambda k: int(k))

        # A frozenset keeps the per-record membership tests in the two
        # filters below O(1); a list would be scanned for every record.
        keys = frozenset(user_tuples.keys().collect())

        matrix_tuples = matrix.flatMap(flat_matrix) \
                              .map(map_matrix) \
                              .filter(lambda x: x[0] in keys)

        # NOTE(review): presumably read by flat_recom -- confirm.
        mt = matrix_tuples.collectAsMap()

        # Tuple-unpacking lambda parameters were removed in Python 3
        # (PEP 3113), so the pair is indexed explicitly instead.
        recm = user_tuples.flatMap(flat_recom) \
                          .reduceByKey(reduce_recom) \
                          .filter(lambda x: x[0] not in keys) \
                          .sortBy(lambda pair: int(pair[1]))

        recm.coalesce(1).saveAsTextFile(output)
    finally:
        # Release the context so the function can be called again in the
        # same process.
        sc.stop()
def do_process(args):
    """Join the two sequence files named in ``args`` and save the result.

    ``args.wikidata`` is joined with ``args.wikipedia``; each joined record
    goes through ``map_by_qid`` (with ``args.lang``), the values are grouped
    per key into lists, and ``map_filter_unique`` flattens the groups. The
    result is written as one text partition to ``args.output``.
    """
    context = SparkContext(appName="task")
    wikipedia = context.sequenceFile(args.wikipedia)
    wikidata = context.sequenceFile(args.wikidata)

    joined = wikidata.join(wikipedia)
    by_qid = joined.map(lambda record: map_by_qid(record, args.lang))
    grouped = by_qid.groupByKey().mapValues(list)
    unique = grouped.flatMap(map_filter_unique)

    unique.repartition(1).saveAsTextFile(args.output)
def user_artist_matrix(file_name, output="user_artist_matrix.out"):
    """Build the comparison matrix from a sequence file and save it.

    As a side effect the module-level global ``avg_matrix`` is set to the
    dict collected from the ``ua_reduce_vec`` / ``ua_map_avg`` stage.

    Args:
        file_name: Path of the sequence file to read.
        output: Directory the single-partition text result is written to.
    """
    global avg_matrix

    context = SparkContext("local[8]", "UserArtistMatrix")
    records = context.sequenceFile(file_name)

    # flatMap: one input -> zero or more outputs; map: one input -> one
    # output; reduceByKey: aggregate the values of each key.
    reduced = records.flatMap(ua_flat_doc).map(ua_map).reduceByKey(ua_reduce)
    ordered = reduced.sortByKey(keyfunc=lambda key: int(key))
    ua_matrix = ordered.flatMap(ua_flat_vec)

    averages = ua_matrix.reduceByKey(ua_reduce_vec).map(ua_map_avg)
    avg_matrix = averages.collectAsMap()

    compared = ua_matrix.map(ua_map_cmp).reduceByKey(ua_reduce_cmp)
    co_matrix = compared.map(ua_map_cmp_final)

    # Write everything out to OUTPUT as a single partition.
    co_matrix.coalesce(1).saveAsTextFile(output)
def main(argv=None):
    '''Count the tokens produced by getTokens over a sequence file.

    Called when run from the command line. The counts are optionally written
    to ``--output`` and/or printed (``--printToLog``); the good/bad JSON
    record accumulators are always printed.

    NOTE(review): ``argv`` is accepted but not used -- ``parse_args()`` reads
    ``sys.argv``. Left unchanged because the callers are not visible here.
    '''
    global goodJsonRecords, badJsonRecords

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help="Seq input file on cluster.", required=True)
    parser.add_argument('-o', '--output', help="UTF-8 output file on cluster.", required=False)
    parser.add_argument('-p', '--printToLog', help="Print results to log.", required=False, action='store_true')
    args = parser.parse_args()

    sc = SparkContext()
    try:
        goodJsonRecords = sc.accumulator(0)
        badJsonRecords = sc.accumulator(0)
        data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
        tagCounts = data.values().flatMap(getTokens).countByValue()
    finally:
        # countByValue() already brought the result to the driver, so the
        # context can be released before reporting.
        sc.stop()

    # So far, this code isn't useful. The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output is not None:
        with codecs.open(args.output, 'wb', 'utf-8') as f:
            for k in sorted(tagCounts):
                f.write(k + " " + str(tagCounts[k]) + "\n")

    print("========================================")
    print("goodJsonRecords = %d" % goodJsonRecords.value)
    print("badJsonRecords = %d" % badJsonRecords.value)
    if args.printToLog:
        for k in sorted(tagCounts):
            print("%s %s" % (json.dumps(k), tagCounts[k]))
    print("========================================")
def main(argv):
    """Apply ``trim`` to every record of a sequence file and save a copy.

    The result is written with the new-API ``SequenceFileOutputFormat`` as
    Text/Text pairs, record-compressed with ``DefaultCodec``.

    Args:
        argv: Option list (as accepted by ``getopt``) holding
            ``-i <input dir>`` and ``-o <output dir>``.

    Exits with status 2 when the options cannot be parsed or one of the two
    directories is missing.
    """
    usage = "usage: -i <inputSequenceDir> -o <outputSequenceDir>\n"
    inputSequenceDir = ""
    outputSequenceDir = ""
    try:
        opts, _ = getopt.getopt(argv, "i:o:")
    except getopt.GetoptError as err:
        # Say why we are exiting instead of failing silently.
        sys.stderr.write("%s\n%s" % (err, usage))
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-i':
            inputSequenceDir = arg
        elif opt == '-o':
            outputSequenceDir = arg

    # An empty path would only surface later as an obscure Spark failure.
    if not inputSequenceDir or not outputSequenceDir:
        sys.stderr.write(usage)
        sys.exit(2)

    outputFormatClassName = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"
    conf1 = {
        "mapreduce.output.fileoutputformat.compress": "true",
        "mapreduce.output.fileoutputformat.compress.codec": "org.apache.hadoop.io.compress.DefaultCodec",
        "mapreduce.output.fileoutputformat.compress.type": "RECORD",
    }

    sc = SparkContext(appName="Fix XML App")
    try:
        datarawRDD = sc.sequenceFile(inputSequenceDir)
        cleanedRDD = datarawRDD.map(trim)
        cleanedRDD.saveAsNewAPIHadoopFile(outputSequenceDir, outputFormatClassName,
                                          "org.apache.hadoop.io.Text",
                                          "org.apache.hadoop.io.Text",
                                          None, None, conf1)
    finally:
        sc.stop()
    print("OK Bye Bye")
def wordcount(file_name, output="spark-wc-out-wordcount"):
    """Run the flat_map / map / reduce pipeline over a sequence file.

    FILE_NAME is the sequence file to read; the reduced result is written
    as a single text partition under OUTPUT.
    """
    # The context is the entry point to every Spark built-in used below.
    spark = SparkContext("local[8]", "WordCount")

    # An RDD holding all the documents named by the sequence file. From the
    # pyspark interpreter, documents.take(n) shows the text of the first n
    # of them.
    documents = spark.sequenceFile(file_name)

    # flatMap applies a one-input, zero-or-more-outputs function to every
    # DOCUMENT and condenses all the results into a SINGLE RDD.
    flattened = documents.flatMap(flat_map)
    # map applies a one-input, one-output function to every element.
    pairs = flattened.map(map)
    # reduceByKey groups the dataset by key and aggregates each key's values.
    counts = pairs.reduceByKey(reduce)

    # Write everything in counts out to OUTPUT.
    counts.coalesce(1).saveAsTextFile(output)
def main(argv=None):
    '''Report accumulator totals and per-key counts for a sequence file.

    Called when run from the command line. The values of the sequence file
    go through getKeys and the number of occurrences of each emitted key is
    printed, preceded by the four record accumulators.
    '''
    global goodJsonRecords, badJsonRecords, noPublisherRecords, noPublisherNameRecords

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-i', '--input', help="Required Seq input file on cluster.", required=True)
    options = arg_parser.parse_args()

    context = SparkContext()
    goodJsonRecords = context.accumulator(0)
    badJsonRecords = context.accumulator(0)
    noPublisherRecords = context.accumulator(0)
    noPublisherNameRecords = context.accumulator(0)

    text_class = "org.apache.hadoop.io.Text"
    records = context.sequenceFile(options.input, text_class, text_class)
    keyCounts = records.values().flatMap(getKeys).countByValue()

    print("========================================")
    report = (
        ("goodJsonRecords", goodJsonRecords),
        ("badJsonRecords", badJsonRecords),
        ("noPublisherRecords", noPublisherRecords),
        ("noPublisherNameRecords", noPublisherNameRecords),
    )
    for label, accumulator in report:
        print("%s = %d" % (label, accumulator.value))
    for key in sorted(keyCounts):
        print("%s %s" % (key, keyCounts[key]))
    print("========================================")

    context.stop()
def wordCount(file_name, output="spark-wc-out-nonarticleCount"):
    """Run splitDocument / toPairs / sumCounts over a sequence file.

    FILE_NAME is the sequence file to read; the key-sorted result is written
    as a single text partition under OUTPUT. Output-spec validation is
    switched off in the Spark configuration.
    """
    settings = SparkConf().set("spark.hadoop.validateOutputSpecs", "false")
    spark = SparkContext("local[8]", "WordCount", conf=settings)
    documents = spark.sequenceFile(file_name)

    # flatMap: the function turns one input into zero or more items, and all
    # results are combined into one collection -- document -> words.
    words = documents.flatMap(splitDocument)
    # map: the function is applied to each item in turn -- word ->
    # (key, value) pair, the value being the number of occurrences.
    pairs = words.map(toPairs)
    # reduceByKey: values sharing a key are folded two at a time until one
    # value per key remains -- here, the summed occurrences.
    totals = pairs.reduceByKey(sumCounts)
    counts = totals.sortByKey()

    # Write everything in counts out to OUTPUT.
    counts.coalesce(1).saveAsTextFile(output)
def index(file_name, output="spark-wc-out-index"):
    """Flat-map and reduce a sequence file, saving the result as text.

    Args:
        file_name: Path of the sequence file to read.
        output: Directory the single-partition result is written to.
    """
    spark = SparkContext("local[8]", "Index")
    documents = spark.sequenceFile(file_name)

    entries = documents.flatMap(flat_map)
    indices = entries.reduceByKey(reduce)

    indices.coalesce(1).saveAsTextFile(output)
def main(argv=None):
    '''Print per-tag token frequency tables for a sequence file.

    Called when run from the command line. getTokens is expected to emit
    "tag:value" strings; their occurrences are counted, grouped by tag, and
    for every tag a table is printed with index, token, count, cumulative
    count and cumulative fraction of the tag total.

    NOTE(review): ``argv`` is accepted but not used -- ``parse_args()`` reads
    ``sys.argv``.
    '''
    global goodJsonRecords, badJsonRecords

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help="Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    try:
        goodJsonRecords = sc.accumulator(0)
        badJsonRecords = sc.accumulator(0)
        data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
        tagTokenCounts = data.values().flatMap(getTokens).countByValue()
    finally:
        sc.stop()

    print("========================================")
    print("goodJsonRecords = %d" % goodJsonRecords.value)
    print("badJsonRecords = %d" % badJsonRecords.value)
    print("========================================")

    # Restructure the data, grouping by tag (token type indicator):
    tagTokenLists = {}
    for tagToken, count in tagTokenCounts.items():
        tag, tokenValue = tagToken.split(":", 1)
        tagTokenLists.setdefault(tag, []).append(Token(tokenValue, count))

    # Process each tag separately, in sorted order so that the report is
    # reproducible from run to run:
    for tag in sorted(tagTokenLists):
        # Sort the tokens by descending count and ascending token value:
        sortedTokenList = sorted(tagTokenLists[tag],
                                 key=lambda token: (-token.count, token.value))

        # Calculate the cumulative token count for each token in sorted order:
        totalTokens = 0
        for token in sortedTokenList:
            totalTokens += token.count
            token.cumulativeCount = totalTokens

        # We'll use the final total later, but we need it as a float to ensure
        # floating point division is used:
        floatTotalTokens = float(totalTokens)

        # Print the sorted tokens with cumulative counts, fraction of
        # total (cumulative distribution function), and index
        # (enumerate the tokens per tag, starting with 1).
        print("========================================")
        for tokenIndex, token in enumerate(sortedTokenList, 1):
            fractionOfTotal = token.cumulativeCount / floatTotalTokens
            print("{0:8d} {1:50} {2:10d} {3:10d} {4:.5f}".format(
                tokenIndex, json.dumps(tag + ": " + token.value),
                token.count, token.cumulativeCount, fractionOfTotal))
        print("========================================")
def docwordcount(file_name, output="spark-wc-out-docwordcount"):
    """Run the distinct flatMapFunc / mapFunc / reduceFunc pipeline.

    Args:
        file_name: Path of the sequence file to read.
        output: Directory the key-sorted, single-partition result is
            written to.
    """
    sc = SparkContext("local[8]", "DocWordCount")
    file = sc.sequenceFile(file_name)

    # distinct must be *called*: the bare attribute is a bound method, and
    # chaining .map onto it raises AttributeError.
    counts = file.flatMap(flatMapFunc) \
                 .distinct() \
                 .map(mapFunc) \
                 .reduceByKey(reduceFunc) \
                 .sortByKey()

    # The result was previously built but never written to OUTPUT.
    counts.coalesce(1).saveAsTextFile(output)
def docwordcount(file_name, output="wc-out-docwordcount"):
    """Run the flat_map / map / reduce pipeline over a sequence file.

    Args:
        file_name: Path of the sequence file to read.
        output: Directory the single-partition text result is written to.
    """
    spark = SparkContext("local[8]", "DocWordCount")
    documents = spark.sequenceFile(file_name)

    # Your code here.
    flattened = documents.flatMap(flat_map)
    pairs = flattened.map(map)
    counts = pairs.reduceByKey(reduce)

    counts.coalesce(1).saveAsTextFile(output)
def docwordcount(file_name, output="spark-wc-out-docwordcount"):
    """Run flatMapFunc / mapFunc / reduceFunc over a sequence file.

    Args:
        file_name: Path of the sequence file to read.
        output: Directory the single-partition text result is written to.
    """
    spark = SparkContext("local[8]", "DocWordCount")
    documents = spark.sequenceFile(file_name)

    flattened = documents.flatMap(flatMapFunc)
    pairs = flattened.map(mapFunc)
    counts = pairs.reduceByKey(reduceFunc)

    counts.coalesce(1).saveAsTextFile(output)
def docwordcount(file_name, output="spark-wc-out-docwordcount"):
    """Apply flat_map, map and reduce to a sequence file and save the result.

    Args:
        file_name: Path of the sequence file to read.
        output: Directory the single-partition text result is written to.
    """
    context = SparkContext("local[8]", "DocWordCount")
    source = context.sequenceFile(file_name)

    expanded = source.flatMap(flat_map)
    keyed = expanded.map(map)
    counts = keyed.reduceByKey(reduce)

    single_partition = counts.coalesce(1)
    single_partition.saveAsTextFile(output)
def index(file_name, output="spark-wc-out-index"):
    """Apply flat_map, map and reduce to a sequence file and save the result.

    Args:
        file_name: Path of the sequence file to read.
        output: Directory the single-partition text result is written to.
    """
    context = SparkContext("local[8]", "Index")
    source = context.sequenceFile(file_name)

    expanded = source.flatMap(flat_map)
    keyed = expanded.map(map)
    indices = keyed.reduceByKey(reduce)

    indices.coalesce(1).saveAsTextFile(output)
def createIndices(file_name, output="spark-wc-out-createIndices"):
    """Flat-map, reduce and key-sort a sequence file, saving it as text.

    Output-spec validation is switched off in the Spark configuration.

    Args:
        file_name: Path of the sequence file to read.
        output: Directory the single-partition text result is written to.
    """
    settings = SparkConf().set("spark.hadoop.validateOutputSpecs", "false")
    spark = SparkContext("local[8]", "CreateIndices", conf=settings)
    documents = spark.sequenceFile(file_name)

    entries = documents.flatMap(flatMapFunc)
    merged = entries.reduceByKey(reduceFunc)
    indices = merged.sortByKey()

    indices.coalesce(1).saveAsTextFile(output)
def index(file_name, output="spark-wc-out-index"):
    """Run the flat_map / map / reduce pipeline and save the result as text.

    Args:
        file_name: Path of the sequence file to read.
        output: Directory the single-partition text result is written to.
    """
    spark = SparkContext("local[8]", "Index")
    documents = spark.sequenceFile(file_name)

    # Same message as the last exercise: feel free to reshape this pipeline
    # so that it suits your functions better (and satisfies the
    # requirements).
    flattened = documents.flatMap(flat_map)
    pairs = flattened.map(map)
    indices = pairs.reduceByKey(reduce)

    indices.coalesce(1).saveAsTextFile(output)
def mostPopular(file_name, output="spark-wc-out-mostPopular"):
    """Run splitDocument / toPairs / sumCounts and save the result as text.

    FILE_NAME is the sequence file to read; the reduced pairs are written as
    a single partition under OUTPUT. Output-spec validation is switched off
    in the Spark configuration.
    """
    settings = SparkConf().set("spark.hadoop.validateOutputSpecs", "false")
    spark = SparkContext("local[8]", "WordCount", conf=settings)
    documents = spark.sequenceFile(file_name)

    words = documents.flatMap(splitDocument)
    pairs = words.map(toPairs)
    counts = pairs.reduceByKey(sumCounts)
    # TODO: add appropriate extra transformations here

    # Write everything in counts out to OUTPUT.
    counts.coalesce(1).saveAsTextFile(output)
def main(argv=None):
    """Count the phrases emitted by getPhrasesMaker over the input file.

    Called when run from the command line. The input is either a sequence
    file or, with ``--inputTuples``, a text file with one tuple literal per
    line. Counts are optionally written to ``--output`` and/or printed
    (``--printToLog``); the accumulators are always printed.

    Returns:
        1 when both ``--excludeTags`` and ``--includeTags`` are given,
        otherwise None.

    NOTE(review): ``argv`` is accepted but not used -- ``parse_args()`` reads
    ``sys.argv``.
    """
    global goodJsonRecords, badJsonRecords, excludedTagCount, includedTagCount, tokenCount

    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--excludeTags", help="Comma-separated list of tags to exclude.", required=False)
    parser.add_argument("--includeTags", help="Comma-separated list of tags to include.", required=False)
    parser.add_argument("-i", "--input", help="Seq or tuple input file.", required=True)
    parser.add_argument("--inputTuples", help="The input file is in tuple format.", required=False, action="store_true")
    parser.add_argument("-o", "--output", help="UTF-8 output file on cluster.", required=False)
    parser.add_argument("-p", "--printToLog", help="Print results to log.", required=False, action="store_true")
    args = parser.parse_args()

    if args.excludeTags and args.includeTags:
        print("Pick either --excludeTags or --includeTags, not both.")
        return 1

    sc = SparkContext()
    try:
        goodJsonRecords = sc.accumulator(0)
        badJsonRecords = sc.accumulator(0)
        excludedTagCount = sc.accumulator(0)
        includedTagCount = sc.accumulator(0)
        tokenCount = sc.accumulator(0)

        if args.inputTuples:
            # SECURITY NOTE(review): eval() executes whatever the input file
            # contains. Only safe for trusted input; ast.literal_eval would
            # be the safer parser if the lines are plain tuple literals.
            data = sc.textFile(args.input).map(lambda x: eval(x))
        else:
            data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
        tagPhraseCounts = data.values().flatMap(getPhrasesMaker(args.includeTags, args.excludeTags)).countByValue()
    finally:
        # Stop the context even when the job fails.
        sc.stop()

    # So far, this code isn't useful. The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output is not None:
        with codecs.open(args.output, "wb", "utf-8") as f:
            for k in sorted(tagPhraseCounts):
                f.write(k + " " + str(tagPhraseCounts[k]) + "\n")

    print("========================================")
    print("goodJsonRecords = %d" % goodJsonRecords.value)
    print("badJsonRecords = %d" % badJsonRecords.value)
    print("excludedTagCount = %d" % excludedTagCount.value)
    print("includedTagCount = %d" % includedTagCount.value)
    print("tokenCount = %d" % tokenCount.value)
    if args.printToLog:
        for k in sorted(tagPhraseCounts):
            print("%s %s" % (json.dumps(k), tagPhraseCounts[k]))
    print("========================================")
def wordcount(file_name, output="spark-wc-out-wordcount"):
    """Run the flat_map / map / reduce pipeline over a sequence file.

    FILE_NAME is the sequence file to read; the reduced result is written
    as a single text partition under OUTPUT.
    """
    spark = SparkContext("local[8]", "WordCount")
    documents = spark.sequenceFile(file_name)

    # flatMap: the function takes one input and yields zero or more items.
    flattened = documents.flatMap(flat_map)
    # map: the function takes one input and yields exactly one item.
    pairs = flattened.map(map)
    # reduceByKey: groups by key and aggregates the values of each key.
    counts = pairs.reduceByKey(reduce)

    # Write everything in counts out to OUTPUT.
    counts.coalesce(1).saveAsTextFile(output)
def perWordDocumentCount(file_name, output="spark-wc-out-perWordDocumentCount"):
    """Run the distinct flatMapFunc / mapFunc / reduceFunc pipeline.

    The result is sorted by key and written as a single text partition.
    Output-spec validation is switched off in the Spark configuration.

    Args:
        file_name: Path of the sequence file to read.
        output: Directory the result is written to.
    """
    sc = SparkContext("local[8]", "PerWordDocumentCount",
                      conf=SparkConf().set("spark.hadoop.validateOutputSpecs", "false"))
    file = sc.sequenceFile(file_name)

    # The output has to end up in alphabetical order, so the sort must be the
    # LAST step: reduceByKey re-partitions the data by key hash, which
    # discards any ordering established before it.
    counts = file.flatMap(flatMapFunc) \
                 .distinct() \
                 .map(mapFunc) \
                 .reduceByKey(reduceFunc) \
                 .sortByKey()

    counts.coalesce(1).saveAsTextFile(output)
def docwordcount(file_name, output="spark-wc-out-docwordcount"):
    """Run the flat_map / map / reduce pipeline and save the result as text.

    Args:
        file_name: Path of the sequence file to read.
        output: Directory the single-partition text result is written to.
    """
    # Context and input are set up exactly as in wordcount.py.
    spark = SparkContext("local[8]", "DocWordCount")
    documents = spark.sequenceFile(file_name)

    # This is the given framework; we urge you not to change it TOO much.
    # The exercise CAN be completed by MOSTLY modifying the functions it
    # calls, but you ARE free to change it. In particular, you will probably
    # want a transformation at the very end to sort the result.
    flattened = documents.flatMap(flat_map)
    pairs = flattened.map(map)
    counts = pairs.reduceByKey(reduce)

    counts.coalesce(1).saveAsTextFile(output)
def main(argv=None):
    """Print the number of records in a Text/Text sequence file.

    Called when run from the command line; the file is named by
    ``-i/--input``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-i", "--input", help="Required Seq input file on cluster.", required=True)
    options = arg_parser.parse_args()

    text_class = "org.apache.hadoop.io.Text"
    context = SparkContext()
    records = context.sequenceFile(options.input, text_class, text_class)
    total = records.count()

    print("========================================")
    print(total)
    print("========================================")

    context.stop()
def main(argv=None):
    '''Print how many records of a sequence file pass goodJsonFilter.

    Called when run from the command line; the Text/Text sequence file is
    named by ``-i/--input``.
    '''
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-i', '--input', help="Required Seq input file on cluster.", required=True)
    options = arg_parser.parse_args()

    text_class = "org.apache.hadoop.io.Text"
    context = SparkContext()
    records = context.sequenceFile(options.input, text_class, text_class)
    total = records.filter(goodJsonFilter).count()

    print("========================================")
    print(total)
    print("========================================")

    context.stop()
def artist_user_matrix(file_name, output="artist_user_matrix.out"):
    """Run the flat_Map / map / reduce pipeline and save "key value" lines.

    FILE_NAME is the sequence file to read. The reduced pairs are sorted by
    integer key and each pair is written to OUTPUT as the key and the value
    joined by one space.
    """
    spark = SparkContext("local[8]", "UserArtistMatrix")
    records = spark.sequenceFile(file_name)

    # flatMap: one input -> zero or more items; map: one input -> one item;
    # reduceByKey: aggregate the values of each key.
    reduced = records.flatMap(flat_Map).map(map).reduceByKey(reduce)
    counts = reduced.sortByKey(keyfunc=lambda key: int(key))

    # Write everything in counts out to OUTPUT.
    lines = counts.map(lambda pair: pair[0] + ' ' + pair[1])
    lines.coalesce(1).saveAsTextFile(output)
def main(argv=None):
    """Extract one key from the JSON values of a sequence file.

    Called when run from the command line. Each value is parsed as JSON and,
    when it contains ``--key``, the value stored under that key is kept.
    ``--count`` prints the number of extracted records and ``--sample N``
    prints the first N of them.

    NOTE(review): ``argv`` is accepted but not used -- ``parse_args()`` reads
    ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c", "--count", help="Optionally report a count of records extracted.", required=False, action="store_true"
    )
    parser.add_argument("-i", "--input", help="Required Seq input file on cluster.", required=True)
    parser.add_argument("-k", "--key", help="Required extraction key.", required=True)
    parser.add_argument(
        "-s", "--sample", type=int, default=0, help="Optionally print a sample of results.", required=False
    )
    args = parser.parse_args()
    extractionKey = args.key

    def extractValues(value):
        """Return an iterator over zero or one extracted value."""
        try:
            d = json.loads(value)
            if extractionKey in d:
                return iter([d[extractionKey]])
        except (ValueError, TypeError, KeyError):
            # Invalid JSON, or JSON that is not a mapping: the record is
            # skipped. Narrower than a bare except, so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            pass
        return iter([])

    sc = SparkContext()
    try:
        data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
        extractedValuePairs = data.flatMapValues(extractValues)

        if args.count:
            recordCount = extractedValuePairs.count()
            print("========================================")
            print(recordCount)
            print("========================================")

        if args.sample > 0:
            sampleSet = extractedValuePairs.take(args.sample)
            print("========================================")
            for record in sampleSet:
                print(record)
            print("========================================")
    finally:
        sc.stop()
def main(argv):
    """Apply ``trim`` to every record of a sequence file and save a copy.

    The result is written with the new-API ``SequenceFileOutputFormat`` as
    Text/Text pairs, record-compressed with ``DefaultCodec``.

    Args:
        argv: Option list (as accepted by ``getopt``) holding
            ``-i <input dir>`` and ``-o <output dir>``.

    Exits with status 2 when the options cannot be parsed or one of the two
    directories is missing.
    """
    usage = "usage: -i <inputSequenceDir> -o <outputSequenceDir>\n"
    inputSequenceDir = ""
    outputSequenceDir = ""
    try:
        opts, _ = getopt.getopt(argv, "i:o:")
    except getopt.GetoptError as err:
        # Report the problem rather than exiting without a word.
        sys.stderr.write("%s\n%s" % (err, usage))
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-i':
            inputSequenceDir = arg
        elif opt == '-o':
            outputSequenceDir = arg

    # Without this check an empty path only fails later, inside Spark.
    if not inputSequenceDir or not outputSequenceDir:
        sys.stderr.write(usage)
        sys.exit(2)

    outputFormatClassName = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"
    conf1 = {
        "mapreduce.output.fileoutputformat.compress": "true",
        "mapreduce.output.fileoutputformat.compress.codec": "org.apache.hadoop.io.compress.DefaultCodec",
        "mapreduce.output.fileoutputformat.compress.type": "RECORD",
    }

    sc = SparkContext(appName="Fix XML App")
    try:
        datarawRDD = sc.sequenceFile(inputSequenceDir)
        cleanedRDD = datarawRDD.map(trim)
        cleanedRDD.saveAsNewAPIHadoopFile(outputSequenceDir, outputFormatClassName,
                                          "org.apache.hadoop.io.Text",
                                          "org.apache.hadoop.io.Text",
                                          None, None, conf1)
    finally:
        sc.stop()
    print("OK Bye Bye")
def labelData(input):
    """Turn a (values, genre) pair into a LabeledPoint.

    The label is 1 when ``genre`` occurs in ``values`` and 0 otherwise. The
    genre index is dropped from the features and every index above it is
    shifted down by one, so the sparse vector has ``column_num - 1`` entries,
    each set feature carrying the value 1.

    Args:
        input: Pair whose first item is a list of column indices and whose
            second item is the target column index.
    """
    # Work on a copy so the caller's list is not mutated by remove().
    values = list(input[0])
    genre = input[1]
    if genre in values:
        label = 1
        values.remove(genre)
    else:
        label = 0
    # shift the attributes by one index
    values = [x if x < genre else x - 1 for x in values]
    ones = [1] * len(values)
    return LabeledPoint(label, SparseVector(column_num - 1, values, ones))


#set hdfs path
# NOTE(review): the first assignment is overwritten immediately; it looks
# like a production/test toggle -- confirm which path is meant to be live.
data = sc.sequenceFile("hdfs://nameservice1/user/geap/warehouse/camus/etl/rat/hourly/2015/06/01/00/*")
data = sc.sequenceFile("hdfs://localhost:9000/test/*")

# Tuple-unpacking lambda parameters were removed in Python 3 (PEP 3113), so
# the pair is indexed explicitly.
parsedData = data.filter(filterPoint) \
                 .map(parsePoint) \
                 .reduceByKey(lambda x, y: x + y) \
                 .map(lambda kv: list(set(kv[1])))
parsedData.cache()

#Calculate total number of columns in the dataset
# One distinct() job yields both the sorted ids and their count.
column_id = sorted(parsedData.flatMap(lambda ids: ids).distinct().collect())
column_num = len(column_id)

#choose a genre to test
# NOTE(review): the earlier comment named the 100th column as the default
# target, but the value used is 1 -- confirm.
genre = 1
sortedData = parsedData.map(sortPoint)
#!/usr/bin/env python


def clean_geonames(item):
    """Slim down the "geonames_address" entries of ``item`` in place.

    For every address the "geo" value is replaced by a dict holding only its
    "lat" and "lon", and "hasAlternateName" is dropped when present. Items
    without "geonames_address" are returned untouched.

    Returns:
        The same ``item`` object that was passed in.
    """
    if "geonames_address" not in item:
        return item

    cleaned = []
    for address in item["geonames_address"]:
        coords = address["geo"]
        address["geo"] = {"lat": coords["lat"], "lon": coords["lon"]}
        address.pop("hasAlternateName", None)
        cleaned.append(address)
    item["geonames_address"] = cleaned
    return item


if __name__ == "__main__":
    from pyspark import SparkContext
    import json
    import sys

    sc = SparkContext(appName="LSH")
    inputFilename = sys.argv[1]
    outputFilename = sys.argv[2]

    # Parse each value as JSON, clean it, and serialise it back.
    rdd = sc.sequenceFile(inputFilename)
    json_rdd = rdd.mapValues(json.loads)
    revised_rdd = json_rdd.mapValues(clean_geonames)
    serialised = revised_rdd.mapValues(json.dumps)
    serialised.saveAsSequenceFile(outputFilename)
# Convert the problem5 orders data set between storage formats. Every step
# either sets a codec option, writes `orders` somewhere under .../problem5/,
# or re-reads it; the statement order therefore matters.

# Load the orders from the snappy-compressed Parquet directory.
sqlContext.setConf('spark.sql.parquet.compression.codec','snappy')
orders = sqlContext.read.load('/user/saurinchauhan/anilagrawal/cloudera/problem5/parquet-snappy-compress','parquet')

# Save the data as Parquet with no compression (problem5/parquet-no-compress).
sqlContext.setConf('spark.sql.parquet.compression.codec','uncompressed')
orders.write.save('/user/saurinchauhan/anilagrawal/cloudera/problem5/parquet-no-compress','parquet')

# Save the data as Avro with snappy compression (problem5/avro-snappy),
# then read it back from there.
sqlContext.setConf('spark.sql.avro.compression.codec','snappy')
orders.write.save('/user/saurinchauhan/anilagrawal/cloudera/problem5/avro-snappy','com.databricks.spark.avro')
orders = sqlContext.read.load('/user/saurinchauhan/anilagrawal/cloudera/problem5/avro-snappy','com.databricks.spark.avro')

# Save the data as JSON text with no compression (problem5/json-no-compress).
orders.toJSON().saveAsTextFile('/user/saurinchauhan/anilagrawal/cloudera/problem5/json-no-compress')

# Save the data as JSON text with gzip compression (problem5/json-gzip),
# then read it back from there.
orders.toJSON().saveAsTextFile('/user/saurinchauhan/anilagrawal/cloudera/problem5/json-gzip','org.apache.hadoop.io.compress.GzipCodec')
orders = sqlContext.read.load('/user/saurinchauhan/anilagrawal/cloudera/problem5/json-gzip','json')

# Save the first four columns as comma-separated text with gzip compression
# (problem5/csv-gzip). The fourth column is concatenated without str().
orders.rdd.map(lambda line: (str(line[0])+","+str(line[1])+","+str(line[2])+","+line[3])).saveAsTextFile('/user/saurinchauhan/anilagrawal/cloudera/problem5/csv-gzip','org.apache.hadoop.io.compress.GzipCodec')

# Read the Text/Text sequence file, split each value on commas into a tuple
# and turn the result into a DataFrame.
orders = sc.sequenceFile('/user/saurinchauhan/anilagrawal/cloudera/problem5/sequence','org.apache.hadoop.io.Text','org.apache.hadoop.io.Text')
ordersDF = orders.map(lambda line: tuple(line[1].split(','))).toDF()

# Save the DataFrame as ORC (problem5/orc).
# NOTE(review): the option set here is the Parquet codec while the write
# below uses the 'orc' format -- confirm this is what was intended.
sqlContext.setConf('spark.sql.parquet.compression.codec','uncompressed')
ordersDF.write.save('/user/saurinchauhan/anilagrawal/cloudera/problem5/orc','orc')
if exp: status = exp.groupdict()["status"] request = exp.groupdict()["request"] if request: requestFields = request.split() if (len(requestFields) > 1): # converted bytearray to string return (str(requestFields[1]), str(status)) if __name__ == "__main__": sc = SparkContext(appName="SparkHdfsLogAggregator") sc.setLogLevel("ERROR") logs = sc.sequenceFile('/user/maria_dev/logs/19-07-22/2020/00/') lines = logs.map(lambda x: x[1]) urls_status = lines.map(extractURLRequestAndStatus) # Reduce by URL over a 5-minute window sliding every second urlStatusMapper = urls_status.map(lambda x: (x, 1)) urlStatusReducer = urlStatusMapper.reduceByKey(lambda x, y: x + y) ''' Sort and print the results in descending order of count ''' sortedResults = urlStatusReducer.sortBy(lambda x: -x[1]) print sortedResults.collect()
# Compute TF-IDF vectors for the documents of a sequence file and save them
# as text under d_out.
d_out = to_hdfs_url(args.output)
# NOTE(review): min_df is not used in the code visible here; IDF() below is
# built with its defaults -- confirm whether it should receive this value.
min_df = int(args.min_document_frequency)

# remove any previous output (is there a way to do it from spark?)
#system("hdfs dfs -rm -r %s" % d_out)

# import spark-related stuff
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF

# init the spark context (reuse one that already exists, e.g. in a shell)
if "sc" not in globals():
    sc = SparkContext(appName="TF-IDF")

# Load documents: the sequence file yields (file name, content) pairs.
documents = sc.sequenceFile(docs_dir)

# keep only the content, split on single spaces. Tuple-unpacking lambda
# parameters were removed in Python 3 (PEP 3113), so the pair is indexed.
documents = documents.map(lambda pair: pair[1].split(" "))

hashingTF = HashingTF()
tf = hashingTF.transform(documents)

# IDF
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

#save
tfidf.saveAsTextFile(d_out)
#create hashes and reduce by key dict = document_terms.flatMap(lambda terms: [(t, self.indexOf(t)) for t in terms]).reduceByKey(lambda a, b: a) return dict def filter_and_split(text): delims = u"\r\n\t.,;:'\"()?!$#-0123456789/*%<>@[]+`~_=&^ " translate_table = dict((ord(char), u" ") for char in delims) return text.lower().strip().translate(translate_table).split(" ") # init the spark context if "sc" not in globals(): sc = SparkContext( appName="TF-IDF") # Load documents (one per line). documents = sc.sequenceFile(docs_dir).map(lambda (fname, content): filter_and_split(content)) documents.cache() # # keep only the content (replace, lower, split, etc) # documents = documents. hashingTF = myHashingTF() # create the tf vectors tf = hashingTF.transform(documents) # create the idf vectors idf = IDF().fit(tf) tfidf = idf.transform(tf) #save tfidf.saveAsTextFile(d_out)
#outfile = '/user/ychan/data/out/karma/part-00000' #infile_type = 'sequence' infile = '/user/ychan/data/blog/blog.json' outfile = '/user/ychan/data/out/blog/blog.json.extractions' infile_type = 'text' #infile = '/user/ychan/data/twitter/tweet.json' #outfile = '/user/ychan/data/out/twitter/tweet.json.extractions' #infile_type = 'text' if infile_type == 'text': rdd = sc.textFile(infile).map(lambda x: json.loads(x)).map( lambda x: (x["url"], x)) else: rdd = sc.sequenceFile(infile).mapValues(lambda x: json.loads(x)) print('rdd count %d' % (rdd.count())) start_time = time.time() feature_rdd = rdd.mapValues(lambda x: decoder.line_to_predictions( ner_fea, Decoder(params), x, attribute_name)) #for fv in feature_rdd.take(3): # print(fv) #end_time = time.time() #print("****************** Elapsed time to transform RDD was %g seconds" % (end_time - start_time)) #start_time = time.time() feature_rdd.mapValues(lambda x: json.dumps(x)).saveAsSequenceFile(outfile) end_time = time.time() print(
agg_feature_name = agg_feature["name"] else: agg_feature_name = agg_feature[0]["name"] fc[agg_feature_name] = agg_feature return cluster if __name__ == "__main__": """ Usage: featureReducer.py [input] [output] [reducer:feature_name]... """ sc = SparkContext(appName="DigFeatureReducer") inputFilename = sys.argv[1] outputFilename = sys.argv[2] data = sc.sequenceFile(inputFilename, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text") json_values = data.mapValues(lambda row: json.loads(row)) child_name = sys.argv[3] aggregations = sys.argv[4:] # result = json_values # for aggregation in aggregations: # result = rdd_aggregate(result, aggregation) if len(aggregations) > 0: result = json_values.mapValues(lambda cluster: aggregate_features(cluster, child_name, aggregations)) result.mapValues(lambda cluster: threadUtil.get_sorted_cluster(cluster)).saveAsSequenceFile(outputFilename)
#!/usr/bin/env python
from pyspark import SparkContext
import sys

if __name__ == "__main__":
    # Print the number of distinct keys in the sequence file named by the
    # first command-line argument.
    sc = SparkContext(appName="CountKeys")
    try:
        pairs = sc.sequenceFile(sys.argv[1])

        # Keep a single record per key.
        unique = pairs.reduceByKey(lambda x, y: x)

        # count() runs on the executors and returns one number. The earlier
        # approach bumped an accumulator inside map() and collect()-ed every
        # record to the driver; accumulator updates made inside
        # transformations may be applied more than once when a task is
        # re-executed.
        num_lines = unique.count()
    finally:
        sc.stop()

    print("Num lines: %d" % num_lines)
def main(args):
    """Run the user-visit benchmark queries for every dataset size.

    For each entry of ``args.sizes`` the "uservisits" sequence file is loaded
    and registered as a temporary table, revenue/duration statistics are
    aggregated per OS and per browser, and the timings plus both result
    frames are written under ``results/<size>/spark/<args.nodes>``.

    Returns 0.
    """
    path_prefix = "hdfs:///amplab/sequence"
    conf = sc = sql = None

    def to_row(x):
        # One Row per visit; fields are positional in the comma-split record
        # after the user-agent column has been expanded by parse_agent().
        return Row(
            source_ip=x[0],
            url=x[1],
            date=date(x[2]),
            revenue=float(x[3]),
            os_name=x[4],
            os_version=x[5],
            browser_name=x[6],
            browser_version=x[7],
            country=x[8],
            language=x[9],
            search=x[10],
            duration=int(x[11]),
        )

    for size in args.sizes:
        timings = {}
        # Only the uservisits table is used (rankings and crawl are not).
        visitors_table_name = "visitors_%s" % size

        # The Spark handles are created on the first iteration only.
        if sql is None:
            conf = SparkConf()
            sc = SparkContext(conf=conf)
            sql = SQLContext(sc)

        tic()
        visits = sc.sequenceFile(path.join(path_prefix, size, "uservisits"))
        fields = visits.map(lambda x: tuple(x[1].split(",")))
        expanded = fields.map(lambda x: x[:4] + parse_agent(x[4]) + x[5:])
        frame = sql.createDataFrame(expanded.map(to_row))
        frame.registerTempTable(visitors_table_name)
        timings["open-and-register"] = toc()

        tic()
        os_query = """
            SELECT
                os_name AS os,
                COUNT(FALSE) AS total_visitors,
                SUM(revenue) AS total_revenue,
                AVG(revenue) AS average_revenue,
                SUM(duration) AS total_duration,
                AVG(duration) AS average_duration
            FROM {}
            GROUP BY os_name
        """.format(visitors_table_name)
        os_results = sql.sql(os_query).toPandas()
        timings["q-stats-by-os"] = toc()
        os_results.index = os_results.pop("os")

        tic()
        browser_query = """
            SELECT
                browser_name AS browser,
                COUNT(FALSE) AS total_visitors,
                SUM(revenue) AS total_revenue,
                AVG(revenue) AS average_revenue,
                SUM(duration) AS total_duration,
                AVG(duration) AS average_duration
            FROM {}
            GROUP BY browser_name
        """.format(visitors_table_name)
        browser_results = sql.sql(browser_query).toPandas()
        timings["q-stats-by-browser"] = toc()
        browser_results.index = browser_results.pop("browser")

        top_dir = path.join("results", size, "spark", str(args.nodes))
        mkdir_p(top_dir)

        with open(path.join(top_dir, "timings"), "w") as f:
            for label, seconds in timings.items():
                f.write("%s, %.18e\n" % (label, seconds))
            f.flush()

        browser_results.to_pickle(path.join(top_dir, "browser"))
        os_results.to_pickle(path.join(top_dir, "os"))

    return 0
# Command line: <input path> <year to search> <output file>
inputPath = sys.argv[1]
# Kept as a string: it is compared below against the (string) year column.
year_to_search = sys.argv[2]
outputFile = sys.argv[3]

# start SparkContext
conf = SparkConf().setAppName('popular_4gram')
sc = SparkContext(conf=conf)

# Log through the JVM's log4j, reached via the context's `_jvm` handle.
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)
LOGGER.info('***** PYSPARK SCRIPT LOGGER INITIALIZED')

# read input
LOGGER.info('***** READING LZO HADOOP FILE')
# LZO indexed by row i.e. <1:ngram data, 2: ngram data, 3: ngram data>
files = sc.sequenceFile(inputPath, "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text")
# Drop the LongWritable key; only the Text value of each record is used.
lzoRDD = files.map(lambda x: x[1])

# Split every record on runs of tab characters into a list of columns.
LOGGER.info('***** SPLITTING LZO INPUT')
allEntries = lzoRDD.map(lambda x: re.split(r'\t+',x))

# Disabled step, kept for reference:
# LOGGER.info('***** GENERATING 3-TUPLES')
# # 4gram: x[0]:ngram - x[1]:year - x[2]:occurrences
# formattedEntries = allEntries.map(lambda x: (x[0], x[1], x[2]))
# # formattedEntries - "word word word word", 1905, 54

# Keep only the entries whose second column equals the requested year.
LOGGER.info('***** FILTERING ENTRIES TO INPUTTED YEAR')
filteredEntries = allEntries.filter(lambda x: x[1] == year_to_search)

LOGGER.info('***** SORT BY OCCURRENCES')
if ("-ner" in processarguments): doNer = True outNer = True else: doNer = False outNer = False # # startTime = time.clock() # Initialize Spark conf = SparkConf() spark = SparkContext(appName="estnltk_seqfile_analyser", conf=conf) logger = logging.getLogger('pyspark') # logging does not work, pickling err # Open input files input_files = spark.sequenceFile(sys.argv[1]) # Perform all processes in one map keytextrdd = input_files.map(lambda keyval : processSequencePair(keyval[0], keyval[1])) keytextrdd.coalesce(1).saveAsTextFile(sys.argv[2]) ''' # Clean the input files (html -> only text content). Justext returns paragraphs. if (isPlaintextInput == True): keytextpairs = input_files.map(lambda line : (line[0], estnltk.Text(line[1]))) else: keytextpairs = input_files.map(lambda line : (line[0], parseHtmlToText(line[1]))) # # emptytextpairs = keytextpairs.filter(lambda keytext : len(keytext[1].words)==0) keytextpairs = keytextpairs.filter(lambda keytext : len(keytext[1].words) > 0)
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the first three records of the programming_ranking SequenceFiles."""

from pyspark import SparkContext

sc = SparkContext("local", "Simple App")

# Text keys paired with DoubleWritable values.
data = sc.sequenceFile(
    "programming_ranking/*",
    "org.apache.hadoop.io.Text",
    "org.apache.hadoop.io.DoubleWritable",
)

first_records = data.take(3)
print (first_records)
arr = {} for w in res: if len(w) >= minlen: #w = w.decode('utf-8-sig').encode('utf-8').lower() w = w.decode('utf-8-sig').lower() arr[w] = 1 if not arr.has_key(w) else arr[w] + 1 return arr.items() conf = SparkConf() conf.setAppName('Spanish') sc = SparkContext(conf=conf) f = sc.sequenceFile('/project/public/collections-as-data', 'org.apache.hadoop.io.Text', 'org.apache.hadoop.io.BytesWritable').cache() scanned = f.filter( lambda (n, t): n.split('/')[-1].startswith('chc') and n.endswith('txt')) count = f.count() txtcount = scanned.count() top10 = scanned.flatMap(lambda (n, t): split_to_words(t, 4)).reduceByKey( lambda a, b: a + b).sortBy(lambda x: -x[1]).take(10) print 'found ', count, 'files, ', txtcount, ' of them are scanned files' for t in top10: print t[0].encode('utf-8'), t[1]
def main(argv=None):
    '''Count "tag:token" values with Spark and print a frequency table per tag.

    This is called if run from command line.  Options are read from
    ``sys.argv``; the *argv* parameter is accepted but not consulted.

    Returns 1 when both --excludeTags and --includeTags are given; otherwise
    returns None after the statistics and the per-tag tables are printed.
    '''
    global goodJsonRecords, badJsonRecords, excludedTagCount, includedTagCount, tokenCount

    parser = argparse.ArgumentParser()
    parser.add_argument('-e','--excludeTags', help="Comma-separated list of tags to exclude.", required=False)
    parser.add_argument(     '--includeTags', help="Comma-separated list of tags to include.", required=False)
    parser.add_argument('-i','--input', help="Seq or tuple input data file.", required=True)
    parser.add_argument(     '--inputTuples', help="The input file is in tuple format.", required=False, action='store_true')
    args = parser.parse_args()

    if args.excludeTags and args.includeTags:
        print("Pick either --excludeTags or --includeTags, not both.")
        return 1

    sc = SparkContext()
    try:
        # The counters are module globals so the token-extraction function
        # can update them.
        goodJsonRecords = sc.accumulator(0)
        badJsonRecords = sc.accumulator(0)
        excludedTagCount = sc.accumulator(0)
        includedTagCount = sc.accumulator(0)
        tokenCount = sc.accumulator(0)

        if args.inputTuples:
            # NOTE(review): eval() executes whatever the input file contains;
            # only use --inputTuples on trusted data (ast.literal_eval would
            # be the safe alternative if the lines are plain tuple literals).
            data = sc.textFile(args.input).map(lambda x: eval(x))
        else:
            data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
        tagTokenCounts = data.values().flatMap(getTokensMaker(args.includeTags, args.excludeTags)).countByValue()
    finally:
        # Release the context even when the job fails.
        sc.stop()

    print("========================================")
    print("goodJsonRecords = %d" % goodJsonRecords.value)
    print("badJsonRecords = %d" % badJsonRecords.value)
    print("excludedTagCount = %d" % excludedTagCount.value)
    print("includedTagCount = %d" % includedTagCount.value)
    print("tokenCount = %d" % tokenCount.value)
    print("========================================")

    # Restructure the data, grouping by tag (token type indicator):
    tagTokenLists = {}
    for tagToken, count in tagTokenCounts.items():
        (tag, tokenValue) = tagToken.split(":", 1)
        tagTokenLists.setdefault(tag, []).append(Token(tokenValue, count))

    # Process each tag separately:
    for tag, tokenList in tagTokenLists.items():
        # Sort the tokens by descending count and ascending token value
        # (one pass instead of two chained stable sorts).
        sortedTokenList = sorted(tokenList, key=lambda token: (-token.count, token.value))

        # Calculate the cumulative token count for each token in sorted order:
        totalTokens = 0
        for token in sortedTokenList:
            totalTokens += token.count
            token.cumulativeCount = totalTokens

        # We'll use the final total later, but we need it as a float to ensure
        # floating point division is used:
        floatTotalTokens = float(totalTokens)

        # Print the sorted tokens with counts, fraction of total,
        # cumulative counts, cumulative distribution function, and
        # index (enumerate the tokens per tag, starting with 1).
        print("========================================")
        for tokenIndex, token in enumerate(sortedTokenList, 1):
            fractionOfTotal = token.count / floatTotalTokens
            cumulativeFractionOfTotal = token.cumulativeCount / floatTotalTokens
            print("{0:8d} {1:50} {2:10d} {3:.5f} {4:10d} {5:.5f}".format(tokenIndex, json.dumps(tag + ": " + token.value),
                                                                         token.count, fractionOfTotal,
                                                                         token.cumulativeCount, cumulativeFractionOfTotal))
    print("========================================")
# Positional command-line arguments: <input path> <index> <doc type>.
parser = OptionParser()
(c_options, args) = parser.parse_args()
input_path = args[0]
index = args[1]
doc = args[2]

sc = SparkContext(appName="DIG-LOAD_TO_ES")
conf = SparkConf()

# Elasticsearch write settings handed to the ES helper below.
es_write_conf = {
    "es.nodes": "10.1.94.103",
    "es.port": "9201",
    "es.nodes.discover": "false",
    'es.nodes.wan.only': "true",
    "es.resource": index + '/' + doc,  # write target "<index>/<doc>" from the command line
    "es.http.timeout": "30s",
    "es.http.retries": "20",
    "es.batch.write.retry.count": "20",  # maximum number of retries set
    "es.batch.write.retry.wait": "300s",  # on failure, time to wait prior to retrying
    "es.batch.size.entries": "200000",  # number of docs per batch
    "es.mapping.id": "cdr_id",  # use the `cdr_id` field as Elasticsearch `_id`
    "es.input.json": "true"
}

es_man = ES(sc, conf, es_write_conf=es_write_conf)
input_rdd = sc.sequenceFile(input_path)  # .partitionBy(1000)
# Show one record before starting the upload.
print input_rdd.first()
es_man.rdd2es(input_rdd)
_BANNER_RULE = "========================================"


def _print_banner(*lines):
    """Print each of *lines* framed above and below by a horizontal rule."""
    print(_BANNER_RULE)
    for line in lines:
        print(line)
    print(_BANNER_RULE)


def _elapsed_message(startTime):
    """Return the "Elapsed time: ..." line measured from *startTime*."""
    # TODO: use time.monotonic() in Python >= 3.3
    duration = time.time() - startTime
    return "Elapsed time: %s" % str(datetime.timedelta(seconds=duration))


def _coalesce(rdd, target, label, startTime, verbose):
    """Coalesce *rdd* to *target* partitions when that reduces their number."""
    if target > 0:
        numPartitions = rdd.getNumPartitions()
        if target < numPartitions:
            if verbose:
                _print_banner("Coalescing partitions on %s %d ==> %d" % (label, numPartitions, target))
            rdd = rdd.coalesce(target)
            if verbose:
                _print_banner(_elapsed_message(startTime))
    return rdd


def main(argv=None):
    '''Apply the CRF tagger to an input file with Spark and save the result.

    This is called if run from command line.  Options are read from
    ``sys.argv``; the *argv* parameter is accepted but not consulted.
    The input is a Hadoop sequence file (--inputSeq) or a text file, and the
    output is a sequence file (--outputSeq) or a text file.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--cache', help="Optionally cache the RDD in memory.", required=False, action='store_true')
    parser.add_argument('--coalesceInput', type=int, default=0, help="Reduce the number of partitions on input.", required=False)
    parser.add_argument('--coalesceOutput', type=int, default=0, help="Reduce the number of partitions on output.", required=False)
    parser.add_argument('--count', help="Count the records before writing output.", required=False, action='store_true')
    parser.add_argument('-d','--debug', help="Give debugging feedback.", required=False, action='store_true')
    parser.add_argument('--download', help="Ask Spark to download the feature list and model files to the clients.", required=False, action='store_true')
    parser.add_argument('-e','--embedKey', help="Embed the key in the output.", required=False)
    parser.add_argument('-f','--featlist', help="Input file with features to be extracted, one feature entry per line.", required=True)
    parser.add_argument(     '--fusePhrases', '--fusedPhrases', help="Join each result phrase", required=False, action='store_true')
    parser.add_argument('-k','--keyed', help="The input lines are keyed.", required=False, action='store_true')
    parser.add_argument('--hybridJaccardConfig', help="Configuration file for hybrid Jaccard processing.", required=False)
    parser.add_argument('-i','--input', help="Input file with Web scraping sentences in keyed JSON Lines format.", required=True)
    parser.add_argument('--inputPairs', help="Test the paired input data processing path.", required=False, action='store_true')
    parser.add_argument('--inputSeq', help="Read input from a Hadooop SEQ data file.", required=False, action='store_true')
    parser.add_argument('--inputTuples', help="The input pairs are encoded as tuples", required=False, action='store_true')
    parser.add_argument('-j','--justTokens', help="The input JSON line data is just tokens.", required=False, action='store_true')
    parser.add_argument('-m','--model', help="Input model file.", required=True)
    parser.add_argument('-o','--output', help="Output file of phrases in keyed JSON Lines format.", required=True)
    parser.add_argument('--outputCompressionClass', help="Compression class for text files.", required=False)
    parser.add_argument('--outputPairs', help="Test the paired output data processing path.", required=False, action='store_true')
    parser.add_argument('--outputSeq', help="Write output to a Hadooop SEQ data file.", required=False, action='store_true')
    parser.add_argument('--outputTuples', help="The outout pairs are encoded as tuples", required=False, action='store_true')
    parser.add_argument('--pairs', help="Test the paired data processing path.", required=False, action='store_true')
    parser.add_argument('-p', '--partitions', help="Number of partitions.", required=False, type=int, default=1)
    parser.add_argument('-s','--statistics', help="Report use statistics.", required=False, action='store_true')
    parser.add_argument('-t','--tags', help="Restrict the set of tags and optionally rename them: tagName,tagName:newTagName,...", required=False)
    parser.add_argument('-v','--verbose', help="Report progress.", required=False, action='store_true')
    parser.add_argument('-x','--extract', help="Name the field with text or tokens.", required=False)
    args = parser.parse_args()

    verbose = args.verbose

    if verbose:
        _print_banner("Starting applyCrfSparkTest.")

    # Open a Spark context:
    if verbose:
        _print_banner("Creating SparkContext.")

    # TODO: Use time.monotonic() in python >= 3.3
    startTime = time.time() # Start timing here.
    sc = SparkContext()
    try:
        if verbose:
            _print_banner("SparkContext created. Application ID: ",
                          sc.applicationId,
                          _elapsed_message(startTime))

        # Set up a CRF tagger object:
        tagger = applyCrfSpark.ApplyCrfSpark(args.featlist, args.model, args.hybridJaccardConfig,
                                             inputPairs=args.inputPairs or args.pairs or args.inputSeq,
                                             inputTuples=args.inputTuples,
                                             inputKeyed=args.keyed, inputJustTokens=args.justTokens,
                                             extractFrom=args.extract, tagMap=args.tags,
                                             fusePhrases=args.fusePhrases, embedKey=args.embedKey,
                                             outputPairs=args.outputPairs or args.pairs or args.outputSeq,
                                             outputTuples=args.outputTuples,
                                             debug=args.debug, sumStatistics=args.statistics)
        if verbose:
            _print_banner("CRF++ tagger created.", _elapsed_message(startTime))

        if args.statistics:
            # Convert statistics to Spark accumulators:
            tagger.initializeSparkStatistics(sc)

        if args.download:
            # Ask Spark to download the feature list and model files from the
            # driver to the clients.
            tagger.requestSparkDownload(sc)

        # A partition count of 0 means "let Spark decide".
        minPartitions = args.partitions
        if minPartitions == 0:
            minPartitions = None

        # We'll accept three types of input files: a Sequence file, a text file
        # with tab-separated key and JSON Lines data, or a text file of JSON Lines
        # data (with the output field embedded as an entry in the top-level
        # dictionary).
        if args.inputSeq:
            # This is the primary input path.
            if verbose:
                _print_banner("Opening the input sequence file:", args.input)
            inputRDD = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text",
                                       minSplits=minPartitions)
        else:
            if verbose:
                _print_banner("Opening the input text file:", args.input)
            inputRDD = sc.textFile(args.input, minPartitions)
            if args.inputPairs or args.pairs:
                if verbose:
                    _print_banner("Converting the text lines into input pairs by splitting on tab.")
                inputRDD = inputRDD.map(lambda s: s.split('\t', 1))

        if verbose:
            _print_banner("inputRDD is ready to read from the input file.",
                          _elapsed_message(startTime))

        # Which is better? coalescing before processing or after processing?
        inputRDD = _coalesce(inputRDD, args.coalesceInput, "input", startTime, verbose)

        if args.cache:
            print(_BANNER_RULE)
            print("Caching the input data.")
            inputRDD.cache()
            print(_elapsed_message(startTime))
            print(_BANNER_RULE)

        if args.count:
            print(_BANNER_RULE)
            print("Counting records...")
            localRecordCount = inputRDD.count()
            print("Record count: %d" % localRecordCount)
            print(_elapsed_message(startTime))
            print(_BANNER_RULE)

        # Perform the main RDD processing.
        if verbose:
            _print_banner("Requesting CRF++ tagging")
        resultsRDD = tagger.perform(inputRDD)

        # Which is better? coalescing before processing or after processing?
        resultsRDD = _coalesce(resultsRDD, args.coalesceOutput, "output", startTime, verbose)

        # The output will either be a Sequence file or a text file.  If
        # it's a text file, it might be a tab-separated pair file, or just
        # JSON Lines data.  In either case, the main RDD processing took
        # care of all necessary formatting.  Actually, it "will take
        # care", because it won't really be executed until the save
        # action, below, takes place.
        if args.outputSeq:
            if verbose:
                _print_banner("Transforming data and saving the result as a Hadoop SEQ file.", args.output)
            resultsRDD.saveAsNewAPIHadoopFile(args.output,
                                              outputFormatClass="org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat",
                                              keyClass="org.apache.hadoop.io.Text",
                                              valueClass="org.apache.hadoop.io.Text")
        else:
            if verbose:
                _print_banner("Transforming data and saving the result as a text file.", args.output)
            # Paired results will be converted automatically.
            resultsRDD.saveAsTextFile(args.output,
                                      compressionCodecClass=args.outputCompressionClass)

        if args.statistics:
            print(_BANNER_RULE)
            tagger.showStatistics()
            print(_BANNER_RULE)

        if verbose:
            _print_banner("Ending applyCrfSparkTest.", _elapsed_message(startTime))
    finally:
        # Always release the context, including when a step above fails.
        sc.stop()
# hiveQuery = "select * from CDR where source_name='asu-twitter'" numPartitions = int(args.partitions) numFramerPartitions = numPartitions / 2 numHivePartitions = numPartitions if since == "": numHivePartitions = numPartitions * 20 hdfsRelativeFilname = outputFilename if hdfsRelativeFilname.startswith("hdfs://"): idx = hdfsRelativeFilname.find("/", 8) if idx != -1: hdfsRelativeFilname = hdfsRelativeFilname[idx:] if not args.karma: reduced_rdd_start = sc.sequenceFile( outputFilename + "/reduced_rdd").mapValues(lambda x: json.loads(x)) reduced_rdd = workflow.reduce_rdds_with_settings({"karma.provenance.properties": "source,publisher,dateRecorded:date,observedDate:date"}, numPartitions, reduced_rdd_start)\ .persist(StorageLevel.MEMORY_AND_DISK) else: if args.incremental is True: if len(since) > 0: reduced_rdd_done = hdfs_data_done( hdfs_client, hdfsRelativeFilname + "/reduced_rdd/" + since) else: reduced_rdd_done = hdfs_data_done( hdfs_client, hdfsRelativeFilname + "/reduced_rdd/initial") else: reduced_rdd_done = hdfs_data_done( hdfs_client, hdfsRelativeFilname + "/reduced_rdd")
# Write CSV
def writeRecords(records):
    """Serialise one partition of dict records into a single CSV string.

    Returns a one-element list so the function can be passed to
    ``mapPartitions``; only the "name" and "favouriteAnimal" fields are
    written.
    """
    output = StringIO.StringIO()
    writer = csv.DictWriter(output, fieldnames = ["name", "favouriteAnimal"])
    writer.writerows(records)
    return [output.getvalue()]

pandasLovers.mapPartitions(writeRecords).saveAsTextFile(outputFile)

# Read a SequenceFile (Text keys, IntWritable values).
# The stray Scala `val` keyword was removed: it is a syntax error in Python.
data = sc.sequenceFile(inFile, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.IntWritable")

# Create a HiveContext and query data
from pyspark.sql import HiveContext
hiveCtx = HiveContext(sc)
rows = hiveCtx.sql("SELECT name, age FROM usrs")
firstRow = rows.first()
print(firstRow.name)

# Read JSON data with Spark SQL
tweets = hiveCtx.jsonFile("tweets.json")
tweets.registerTempTable("tweets")
results = hiveCtx.sql("SELECT usr.name, text FROM tweets")
#!/usr/bin/env python
"""Dump the JSON values of a SequenceFile as text, one document per line.

Usage: <script> <input-sequence-file> <output-path>
"""

if __name__ == "__main__":
    from pyspark import SparkContext
    import json
    import sys

    if len(sys.argv) < 3:
        sys.exit("Usage: %s <input-sequence-file> <output-path>" % sys.argv[0])

    sc = SparkContext(appName="sample")
    inputFilename = sys.argv[1]
    outputFilename = sys.argv[2]

    # Parse every value as JSON; the keys are carried along untouched.
    rdd = sc.sequenceFile(inputFilename).mapValues(lambda x: json.loads(x))
    # Index the pair instead of unpacking it in the lambda signature:
    # tuple parameters are a syntax error on Python 3.
    rdd2 = rdd.map(lambda pair: json.dumps(pair[1]))
    rdd2.saveAsTextFile(outputFilename)
from pyspark import SparkContext import re from stemming.porter2 import stem import numpy as np import hadoopy #input_path="hdfs://localhost:9000/alice.txt" input_hdfs_path="hdfs://localhost:9000/user/user/simplewikiFromHbase" output_hdfs_path='hdfs://localhost:9000/user/user/indexwikiFromSpark' words_stop = [line.rstrip('\n') for line in open('../stop_words.txt')] words_stop.append('') sc=SparkContext() lines = sc.sequenceFile(input_hdfs_path).map(lambda (x,y):(x[5:].decode('utf-8'),y[5:].decode('utf-8'))) splitText = lines.map(lambda (url,text):(url,[stem(word.group().lower()) for word in re.finditer(r"\w+",text,re.UNICODE) if word.group().lower() not in words_stop])) tf = splitText.map(lambda (url,splittedText):(url,{word:1.0*splittedText.count(word)/len(splittedText) for word in splittedText})) tfWordAsKey = tf.flatMap(lambda (url,tf):[(word,[(url,tf[word])]) for word in tf]).reduceByKey(lambda a,b:a+b) tfidf = tfWordAsKey.map(lambda (word,tfList):(word,[(url,tf*np.log10(27474.0/len(tfList))) for (url,tf) in tfList])) NwordsMax = 200000 def read_rdd(rdd): for key,data in rdd.takeSample(True,NwordsMax): yield key,data if hadoopy.exists(output_hdfs_path):
"""Round-trip a tiny pair RDD through a local Hadoop SequenceFile."""
from pyspark import SparkContext, SparkConf
import shutil

conf = SparkConf().setAppName('sequenceFiles').setMaster('local').set("spark.ui.port", "4050")
sc = SparkContext(conf=conf)

# Pairs (1, "a"), (2, "aa"), (3, "aaa").
rdd = sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x))

# Clear the output of a previous run.  ignore_errors=True keeps the very
# first run (when the directory does not exist yet) from raising.
shutil.rmtree("sequence_file", ignore_errors=True)
rdd.saveAsSequenceFile("sequence_file")

print(sorted(sc.sequenceFile("sequence_file").collect()))
parser.add_option("-k", "--topk", dest="topk", type="int", help="top n matches", default=3) parser.add_option("-x", "--numPartitions", dest="numPartitions", type="int", help="number of partitions", default=10) parser.add_option("-z", "--candidatesName", dest="candidates_name", type="string", help="name for json element for matching candidates", default="candidates") (c_options, args) = parser.parse_args() print "Got options:", c_options inputFilename = args[0] outputFilename = args[1] print "Save to:", outputFilename clusterer = Clusterer(c_options.numPartitions, c_options.computeSimilarity, c_options.threshold) rdd = sc.sequenceFile(inputFilename).mapValues(lambda x: json.loads(x)) if len(c_options.base) > 0: base = sc.sequenceFile(c_options.base).mapValues(lambda x: json.loads(x)) result = clusterer.compute_clusters_with_base(rdd, base) else: if c_options.computeIdenticalClusters is True: (key_clusterids, result) = clusterer.compute_identical_clusters(rdd) else: result = clusterer.compute_clusters(rdd) if c_options.outputtype == "json": result = clusterer.output_json(result, c_options.topk, c_options.candidates_name) else: result = clusterer.output_csv(result, c_options.topk, c_options.separator) if c_options.outputformat == "text":
# initiate the spark conf = SparkConf() sc = SparkContext(conf=conf) # COMMAND ---------- # read RDDs for each points that are saved from Task A if DATABRICKS: rdds = [f.name[:-1] for f in dbutils.fs.ls(RDD_DIR)] else: rdds = os.listdir(RDD_DIR) rdds.remove("docf") # we don't load docf here #rdds = ["f11", "f12", "f13", "f31", "f32", "f33"] # test data for lightweight purpose for rdd_name in rdds: rdd_new = sc.sequenceFile(RDD_DIR + rdd_name) rdd_new.persist() datapoints.append(DataPoint(name=rdd_name, rdd=rdd_new)) # read docf RDD. It contains all the words i.e. all the dimensions. The value does not matter docf = sc.sequenceFile(RDD_DIR + "docf") """ initialize the K-cluster to initialize, I choose random point from the input as the centroid of the cluster both clusters from cosine similarity and euclidean distance are generated here, so the clusters from two distance functions can be computed simultaneously """ for i in range(0, K_CLUSTERS): for j in range(0, 2): rnd = random.randint(1, len(rdds) - 1) rdd_new = sc.sequenceFile(RDD_DIR + "f" + str(rnd))
from pyspark import SparkContext, SparkConf """ SequenceFiles are a popular Hadoop format composed of flat files with key/value pairs. SequenceFiles have sync markers that allow Spark to seek to a point in the file and then resynchronize with the record boundaries. This allows Spark to efficiently read SequenceFiles in parallel from multiple nodes. """ sparkconf = SparkConf().setAppName('Sequence Read').setMaster('local') sc = SparkContext(conf=sparkconf) links_file = sc.sequenceFile("hdfs://172.19.0.2/pagerank/seq/links", keyClass="org.apache.hadoop.io.Text", valueClass="org.apache.hadoop.io.Text") urls_file = sc.sequenceFile("hdfs://172.19.0.2/pagerank/seq/urls", keyClass="org.apache.hadoop.io.Text", valueClass="org.apache.hadoop.io.Text") links_rdd = links_file.values() urls_rdd = urls_file.values().persist() res_rdd = urls_rdd.join(links_rdd) print res_rdd.take(5) """ Datatype and its corresponding hadoop writable types Int IntWritable or VIntWritable2 Long LongWritable or VLongWritable2 Float FloatWritable Double DoubleWritable Boolean BooleanWritable Array[Byte] BytesWritable String Text
from pyspark.sql.types import StringType, IntegerType, LongType
from pyspark.sql.window import Window


def writeMYSQL(df, table):
    """Overwrite *table* in the local ``rdbms`` MySQL database with *df*.

    Returns whatever ``DataFrameWriter.jdbc`` returns.
    """
    jdbc_url = "jdbc:mysql://localhost:3306/rdbms"
    connection_properties = {"user": "******"}
    return df.write.jdbc(url=jdbc_url,
                         table=table,
                         mode="overwrite",
                         properties=connection_properties)


# Initializing Spark
sc = SparkContext()
sc.setLogLevel("WARN")

# The value of every event is split on commas into one Row.
events = sc.sequenceFile("hdfs:///flume/events/*/*/*")
rdd = events.map(lambda x: Row(*x[1].split(",")))

sqlContext = HiveContext(sc)
purchase_columns = [
    "purchaseDate2",
    "productName",
    "productPrice",
    "productCategory",
    "clientIPAddress",
]
df = sqlContext.createDataFrame(rdd, purchase_columns).cache()

# sparkTopCategories: the ten categories with the most rows.
category_counts = df.groupBy("productCategory").count()
topCategories = (
    category_counts
    .select(col("productCategory"), col("count").alias("cnt"))
    .orderBy(col("count").desc())
    .limit(10)
)
topCategories.show()
writeMYSQL(topCategories, "sparkTopCategories")
return None if __name__ == "__main__": sc = SparkContext(appName="DIG-NAME-EXTRACTION") parser = OptionParser() (c_options, args) = parser.parse_args() input_path = args[0] names_file = args[1] output_path = args[2] input_rdd = ( sc.sequenceFile(input_path) .mapValues(lambda x: json.loads(x)) .mapValues(generate_input) .filter(lambda x: x[1] is not None) ) print json.dumps(input_rdd.first()[1]) t = trie.CharTrie() names = json.load(codecs.open(names_file, "r", "utf-8")) for name in names: t[name] = name T = sc.broadcast(t) results = input_rdd.mapValues(lambda x: name_extractor(x, T)) print results.first() results.mapValues(lambda x: json.dumps(x)).saveAsSequenceFile(output_path)
return item[0], math.log(N / item[1], 10) def freq(item): return item[0], math.log(1 + item[1], 10) def relevancy(item): return item[0], item[1][0] * item[1][1] if __name__ == '__main__': scope = 0 N = 0 sc = SparkContext(appName='project') rdd = sc.sequenceFile("s3://megadados-alunos/web-brasil") # Variable scope -> 0 = Words together, 1 = Hyundai alone, 2 = Honda alone scope = 0 docs_together = rdd.flatMap(document_counter).reduceByKey(count_words) words_together = rdd.flatMap(word_counter).reduceByKey(count_words) scope = 1 docs_hyundai = rdd.flatMap(document_counter).reduceByKey(count_words) words_hyundai = rdd.flatMap(word_counter).reduceByKey(count_words) scope = 2 docs_honda = rdd.flatMap(document_counter).reduceByKey(count_words) words_honda = rdd.flatMap(word_counter).reduceByKey(count_words) N = docs_together.count()
# network-mounted shared file system. # $ PYSPARK_DRIVER_PYTHON=ipython ./bin/pyspark --master local ############################################################################### # parallelize collections ##################################### disData = sc.parallelize([1, 2, 3, 4]) dis_kv = sc.parallelize([('a', 1), ('b', 1)]) # text file, either local path, or hdfs://, s3n://, etc URI disFile = sc.textFile("README.md") # SequenceFile rdd = sc.parallelize(range(1, 4)).map(lambda x: (x, "a"*x)) rdd.saveAsSequenceFile("seq_file") sorted(sc.sequenceFile("seq_file").collect()) ############################################################################### # RDD operation # RDDs support two types of operations: transformations(create a new # dataset) and actions(return a value to the driver program). # All transformations in Spark are lazy, and are only computed when an # action requires. # By default, each transformed RDD may be recomputed each time you run an # action on it. However, you may also persist an RDD in memory, disk or # replicated across multiple nodes. ############################################################################### lines = sc.textFile("README.md")