def main(hdfs_uri):
    """Divolte Spark Example.

    This example processes published Divolte log files at a given location.

    It displays:

     1. The total number of events in the log files.
     2. An arbitrary event.
     3. The ID of the session with the most events, along with the first 10
        events in that session.

    This is equivalent to the scala example.

    Args:
        hdfs_uri: Location of the Avro log files; passed unchanged to
            ``SparkContext.newAPIHadoopFile``.
    """
    sc = SparkContext()

    # Hadoop files are always read as an RDD of key/value pairs. Avro files
    # contain only keys, however, so we immediately map out the values.
    # Indexing (rather than a tuple-unpacking lambda) keeps this valid on
    # Python 3, which removed tuple parameters.
    events_rdd = sc.newAPIHadoopFile(
        hdfs_uri,
        'org.apache.avro.mapreduce.AvroKeyInputFormat',
        'org.apache.avro.mapred.AvroKey',
        'org.apache.hadoop.io.NullWritable',
        keyConverter='io.divolte.spark.pyspark.avro.AvroWrapperToJavaConverter'
    ).map(lambda key_value: key_value[0])

    # We are going to process the RDD several times, so cache the original
    # set in cluster memory so it doesn't have to be loaded each time.
    events_rdd.cache()

    # Calculate the total number of events.
    total_event_count = events_rdd.count()

    # Get the first event in our dataset (which isn't ordered yet).
    # take(1) returns a one-element list, which is what gets printed below.
    an_event = events_rdd.take(1)

    # Find the session with the most events: count per session id, then keep
    # the (id, count) pair with the larger count.
    (longest_session_id, longest_session_count) = events_rdd \
        .map(lambda event: (event['sessionId'], 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .reduce(lambda x, y: max(x, y, key=lambda id_count: id_count[1]))

    # For the session with the most events, find the first 10 events.
    first_events = events_rdd \
        .filter(lambda event: event['sessionId'] == longest_session_id) \
        .map(lambda event: (event['location'], event['timestamp'])) \
        .takeOrdered(10, lambda event: event[1])

    # Simple function for rendering timestamps.
    def timestamp_to_string(ts):
        # The division by 1000.0 treats ts as milliseconds.
        return datetime.fromtimestamp(ts / 1000.0).strftime('%Y-%m-%d %H:%M:%S')

    # Print the results we accumulated, with some whitespace at the
    # front to separate this from the logging.
    print("\n\n")
    print("Number of events in data: %d" % total_event_count)
    print("An event:\n%s" % json.dumps(an_event, indent=2))
    print("Session with id '%s' has the most events: %d" % (longest_session_id, longest_session_count))
    print("First 10 events:")
    print("\n".join(["  %s: %s" % (timestamp_to_string(ts), location) for (location, ts) in first_events]))
def main():
    """Turn the StackExchange ``Posts.xml`` dump into a text file of fields.

    Each ``<row ... />`` element is read as one record, passed through
    ``processXmlFields``, and kept only when that call returns something
    other than ``None``. The surviving results are written with
    ``saveAsTextFile``.
    """
    # Start/end markers that tell the XML input format where a record begins
    # and ends.
    row_markers = {
        'xmlinput.start': '<row',
        'xmlinput.end': '/>'
    }

    spark = SparkContext("yarn", "Simple Titles")
    raw_records = spark.newAPIHadoopFile(
        "/user/cloudera/stackexchange/Posts.xml",
        'com.databricks.spark.xml.XmlInputFormat',
        'org.apache.hadoop.io.Text',
        'org.apache.hadoop.io.Text',
        conf=row_markers)

    # Records are (key, value) pairs; only the second element is used.
    row_texts = raw_records.map(lambda record: record[1])
    parsed_rows = row_texts.map(processXmlFields)
    usable_rows = parsed_rows.filter(lambda fields: fields is not None)

    usable_rows.saveAsTextFile(
        '/user/cloudera/stackexchange/simple_titles_txt')
def main(hdfs_uri):
    """Count Divolte events per event type and write the counts as JSON.

    Reads the Avro log files at ``hdfs_uri``, counts how many events carry
    each ``eventType`` value and writes the resulting mapping, as indented
    JSON, to the path held by the module-level ``ouptput_file`` name.

    Args:
        hdfs_uri: Location of the Avro log files; passed unchanged to
            ``SparkContext.newAPIHadoopFile``.
    """
    sc = SparkContext()

    # Records arrive as (key, value) pairs; only the key is kept.
    # Indexing instead of a tuple-unpacking lambda keeps this valid on
    # Python 3.
    events_rdd = sc.newAPIHadoopFile(
        hdfs_uri,
        'org.apache.avro.mapreduce.AvroKeyInputFormat',
        'org.apache.avro.mapred.AvroKey',
        'org.apache.hadoop.io.NullWritable',
        keyConverter='io.divolte.spark.pyspark.avro.AvroWrapperToJavaConverter'
    ).map(lambda key_value: key_value[0])

    # One pass: (eventType, 1) pairs summed per key, collected to the driver.
    # The earlier unused count()/take(1) actions and the cache() that only
    # served them were dropped; they cost Spark jobs without affecting output.
    distinct_events = dict(
        events_rdd
        .map(lambda event: (event['eventType'], 1))
        .reduceByKey(add)
        .collect())

    # NOTE(review): `ouptput_file` (sic) is defined outside this function;
    # the spelling is kept because the name must match that definition.
    with open(ouptput_file, "w") as h:
        # The with-block closes the file; no explicit close() is needed.
        h.write(json.dumps(distinct_events, indent=2))
def main(hdfs_uri):
    """Compute visit statistics from Divolte logs and write them as JSON.

    The output dictionary is keyed by the module-level constants ``VISITS``
    (total event count), ``DISTINCT_EVENTS`` (event count per ``EVENTYPE``
    value), ``UNIQUE_VISITS`` (number of distinct ``REMOTE_HOST`` values) and
    ``AVERAGE_VISIT`` (pairs produced by ``averageTime`` per remote host).
    It is written, as indented JSON, to the path held by ``ouptput_file``.

    Args:
        hdfs_uri: Location of the Avro log files; passed unchanged to
            ``SparkContext.newAPIHadoopFile``.
    """
    sc = SparkContext()

    # Records arrive as (key, value) pairs; only the key is kept.
    # Indexing instead of a tuple-unpacking lambda keeps this valid on
    # Python 3.
    events_rdd = sc.newAPIHadoopFile(
        hdfs_uri,
        'org.apache.avro.mapreduce.AvroKeyInputFormat',
        'org.apache.avro.mapred.AvroKey',
        'org.apache.hadoop.io.NullWritable',
        keyConverter='io.divolte.spark.pyspark.avro.AvroWrapperToJavaConverter'
    ).map(lambda key_value: key_value[0])

    # The RDD feeds four separate actions below, so keep it in memory.
    events_rdd.cache()

    data = dict()
    data[VISITS] = events_rdd.count()

    # Events per event type.
    distinct_events = events_rdd.map(
        lambda event: (event[EVENTYPE], 1)).reduceByKey(add).collect()
    data[DISTINCT_EVENTS] = dict(distinct_events)

    # Number of distinct remote hosts. The original passed a stray `1` as
    # map()'s second positional argument (preservesPartitioning); it was
    # never part of the lambda and has been removed.
    unique_visits = events_rdd.map(
        lambda event: event[REMOTE_HOST]).distinct().count()
    data[UNIQUE_VISITS] = unique_visits

    # Group timestamps by remote host and let averageTime() summarise each
    # group; its results must be (key, value) pairs for dict() to accept them.
    session_rdd = events_rdd.map(lambda event: (event[REMOTE_HOST],
                                                event[TIMESTAMP]))
    avg = session_rdd.groupBy(lambda e: e[0]).map(
        lambda e: averageTime(e[0], e[1])).collect()
    data[AVERAGE_VISIT] = dict(avg)

    # NOTE(review): `ouptput_file` (sic) is defined outside this function;
    # the spelling is kept because the name must match that definition.
    with open(ouptput_file, "w") as h:
        # The with-block closes the file; no explicit close() is needed.
        h.write(json.dumps(data, indent=2))
# Write every record as an Avro key paired with a None value.
avroRdd.map(lambda x: (x, None)).saveAsNewAPIHadoopFile(
    fileAvroOut,
    "org.apache.avro.mapreduce.AvroKeyOutputFormat",
    "org.apache.avro.mapred.AvroKey",
    "org.apache.hadoop.io.NullWritable",
    keyConverter="irt.pythonconverters.Scheme1ToAvroKeyConverter",
    conf=conf)

# ------------------------------------
# -- read data from avro --
# ------------------------------------
avroRdd2 = sc.newAPIHadoopFile(
    fileAvroOut,
    "org.apache.avro.mapreduce.AvroKeyInputFormat",
    "org.apache.avro.mapred.AvroKey",
    "org.apache.hadoop.io.NullWritable",
    keyConverter="irt.pythonconverters.AvroWrapperToJavaConverter",
    conf=conf)

crudeData = avroRdd2.collect()
# First collected pair, key part: the record that was written above.
output = crudeData[0][0]
# These three fields are passed back through convBytesToObjectPickle.
for k in ['raw1', 'raw2', 'raw3']:
    output[k] = convBytesToObjectPickle(output[k])

# print(...) with a single argument behaves the same on Python 2 and 3,
# unlike the original print statements.
print(80 * '#')
print("input Record")
print(80 * '#')
pprint(record)
Run with example jar: ./bin/spark-submit --driver-class-path /path/to/example/jar \ /path/to/examples/avro_inputformat.py <data_file> [reader_schema_file] Assumes you have Avro data stored in <data_file>. Reader schema can be optionally specified in [reader_schema_file]. """, file=sys.stderr) exit(-1) path = sys.argv[1] sc = SparkContext(appName="AvroKeyInputFormat") conf = None if len(sys.argv) == 3: schema_rdd = sc.textFile(sys.argv[2], 1).collect() conf = {"avro.schema.input.key": reduce(lambda x, y: x + y, schema_rdd)} avro_rdd = sc.newAPIHadoopFile( path, "org.apache.avro.mapreduce.AvroKeyInputFormat", "org.apache.avro.mapred.AvroKey", "org.apache.hadoop.io.NullWritable", keyConverter="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter", conf=conf) output = avro_rdd.map(lambda x: x[0]).collect() for k in output: print(k) sc.stop()
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, yarn=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and optional script with mapper/reducer functions.

    Args:
        schema_file: Path of the Avro schema, read through ``ctx.textFile``.
        data_path: A single input path, or a list of paths that are unioned.
        script: Optional user script; must expose a ``MapReduce`` class.
        spec_file: Optional spec, either a path to a JSON file or a JSON string.
        verbose: When true, keep default logging and report elapsed time.
        yarn: When true, only logs that YARN client mode is enabled.

    Returns:
        The reducer output, or ``None`` when ``script`` has no ``MapReduce``.
    """
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define spark context, it's main object which allow
    # to communicate with spark.
    # Only ship the user script when one was given: pyFiles=[None] would make
    # SparkContext try to register a None path.
    ctx = SparkContext(appName="AvroKeyInputFormat",
                       pyFiles=[script] if script else [])
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define input avro schema, the rdd is a list of lines (sc.textFile
    # similar to readlines): glue the lines together, then drop all
    # whitespace. str.join replaces the Python-2-only reduce() builtin.
    avsc = ''.join(rdd)
    schema = ''.join(avsc.split())
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf)
                              for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

    # process data, here the map will read record from avro file
    # if we need a whole record we'll use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we'll extract it
    # within lambda function, e.g. lambda x: x[0]['jobid'], e.g.
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # in more general way we write mapper/reducer functions which will be
    # executed by Spark via collect call
    spec = None
    if spec_file:
        if os.path.isfile(spec_file):
            # Context manager so the spec file handle is always closed.
            with open(spec_file) as istream:
                spec = json.load(istream)
        else:
            spec = json.loads(spec_file)
    if script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s'
                         % (script, obj))
            ctx.stop()
            return
        mro = obj.MapReduce(spec)
        # example of collecting records from mapper and
        # passing all of them to reducer function
        records = avro_rdd.map(mro.mapper).collect()
        out = mro.reducer(records)

        # the map(f).reduce(f) example but it does not collect
        # intermediate records
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer).collect()
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return out
from pyspark import SparkConf, SparkContext

# Tab-separated word count over every file under the input directory.
conf = SparkConf()
conf.setAppName("spark_app_wordcount_extend")

sc = SparkContext(conf=conf)

try:
    # Each record is an (offset, line) pair; only the line text is used.
    pairs = sc.newAPIHadoopFile(
        "/user/yurun/spark/textfile/",
        "org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat",
        "org.apache.hadoop.io.LongWritable",
        "org.apache.hadoop.io.Text")

    words = pairs.map(lambda pair: pair[1]).flatMap(lambda line: line.split("\t"))

    pairs = words.map(lambda word: (word, 1))

    counts = pairs.reduceByKey(lambda a, b: a + b)

    results = counts.collect()

    for result in results:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(result)
finally:
    # Release the context even when the job fails part-way.
    sc.stop()
vec = x['vec'].copy() for word in vec: key = user_family + ':' + word if key in row: vec[word] = vec[word] + int(row[key]) count = 1 if count_loc in row: count = str(count + int(row[count_loc])) else: count = str(count) #write user vector +count temp = {} for word in vec: temp[user_family + ':' + word] = str(vec[word]) vec = temp vec[count_loc] = count vec[cf1 + ':' + article_pref + id] = 'true' table.put(user_pref + username, vec) ## call and run file_rdds=sc.newAPIHadoopFile(files, "org.apache.hadoop.mapreduce.lib.input.TextInputFormat", "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text", conf={"textinputformat.record.delimiter": "</page>"})\ .map(split_str)\ .map(parse_xml)\ .foreach(write_hbase)
# Initialise the Spark configuration.
conf = SparkConf()
conf.setAppName("Simple App")
conf.set("spark.executor.memory", "1g")
sc = SparkContext(conf=conf)

logFile = "./log.log"  # Should be some file on your system
logData = sc.textFile(logFile).cache()

#########################################################
# TOP 100
kFileNum = 200
total_words = None

# Words that carry no meaning for the ranking (common English words plus
# WARC header names). Built once as a frozenset so each membership test is
# O(1) instead of scanning a list rebuilt on every call.
_NO_MEANING_WORDS = frozenset([
    'was', 'over', 'her', 'them', 'news', 'they', 'what', 'like', 'now',
    'use', 'how', 'see', 'add', 'help', 'when', 'who', 'there', 'here',
    'back', 'also', 'most', 'over', 'make', 'years', 'had', 'into', 'have',
    'may', 'any', 'other', 'more', 'has', 'one', 'which', 'out', 'their',
    'some', 'than', 'its', 'off', 'only', 'his', 'just', 'get', 'been',
    'were', 'would', 'our', 'ago', 'not', 'the', 'and', 'with', 'for',
    'your', 'you', 'the', 'from', 'are', 'that', 'all', 'will', 'this',
    'can', 'but', 'about', 'warcdate', 'warctype', 'contentlength',
    'warcrecordid', 'contenttype', 'warcblockdigest', 'warctargeturi',
    'warcrefersto',
])


def split_and_remove_no_meaning_word(line):
    """Split a record into words, strip non-letters, drop short/stop words.

    NOTE(review): the letter filtering relies on Python 2 semantics, where
    filter() over a str returns a str; it is left untouched on purpose.
    """
    candidate_words = line.split()
    # Strip punctuation and other non-alphabetic characters.
    candidate_words = map(lambda word: filter(str.isalpha, str(word.encode('utf8'))), candidate_words)
    words = filter(lambda word: len(word) > 2 and word not in _NO_MEANING_WORDS, candidate_words)
    return words


# NOTE(review): range(1, kFileNum) visits ids 1..kFileNum-1 — confirm that
# skipping id 0 and id kFileNum is intended.
for file_id in range(1, kFileNum):
    YOUR_FILE = "wet_data/CC-MAIN-20150728002301-%05d-ip-10-236-191-2.ec2.internal.warc.wet" % file_id
    YOUR_DELIMITER = "WARC/1.0"
    # Open the file; the result of this statement is an RDD with one element
    # per delimiter-separated record (the key of each pair is dropped).
    text_file = sc.newAPIHadoopFile(
        YOUR_FILE,
        "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
        "org.apache.hadoop.io.LongWritable",
        "org.apache.hadoop.io.Text",
        conf={"textinputformat.record.delimiter": YOUR_DELIMITER}).map(lambda l: l[1])

    text_file1 = text_file.map(lambda line: line.lower()).flatMap(split_and_remove_no_meaning_word)
    words = text_file1.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
    # Accumulate the per-file counts; they are only unioned here, not merged.
    if total_words is None:
        total_words = words
    else:
        total_words = total_words.union(words)
def main(hdfs_uri):
    """Divolte Spark Example.

    This example processes published Divolte log files at a given location.

    It displays:

     1. The total number of events in the log files.
     2. An arbitrary event.
     3. The ID of the session with the most events, along with the first 10
        events in that session.

    This is equivalent to the scala example.

    Args:
        hdfs_uri: Location of the Avro log files; passed unchanged to
            ``SparkContext.newAPIHadoopFile``.
    """
    sc = SparkContext()

    # Hadoop files are always read as an RDD of key/value pairs. Avro files
    # contain only keys, however, so we immediately map out the values.
    # Indexing (rather than a tuple-unpacking lambda) keeps this valid on
    # Python 3, which removed tuple parameters.
    events_rdd = sc.newAPIHadoopFile(
        hdfs_uri,
        'org.apache.avro.mapreduce.AvroKeyInputFormat',
        'org.apache.avro.mapred.AvroKey',
        'org.apache.hadoop.io.NullWritable',
        keyConverter='io.divolte.spark.pyspark.avro.AvroWrapperToJavaConverter'
    ).map(lambda key_value: key_value[0])

    # We are going to process the RDD several times, so cache the original
    # set in cluster memory so it doesn't have to be loaded each time.
    events_rdd.cache()

    # Calculate the total number of events.
    total_event_count = events_rdd.count()

    # Get the first event in our dataset (which isn't ordered yet).
    # take(1) returns a one-element list, which is what gets printed below.
    an_event = events_rdd.take(1)

    # Find the session with the most events: count per session id, then keep
    # the (id, count) pair with the larger count.
    (longest_session_id, longest_session_count) = events_rdd \
        .map(lambda event: (event['sessionId'], 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .reduce(lambda x, y: max(x, y, key=lambda id_count: id_count[1]))

    # For the session with the most events, find the first 10 events.
    first_events = events_rdd \
        .filter(lambda event: event['sessionId'] == longest_session_id) \
        .map(lambda event: (event['location'], event['timestamp'])) \
        .takeOrdered(10, lambda event: event[1])

    # Simple function for rendering timestamps.
    def timestamp_to_string(ts):
        # The division by 1000.0 treats ts as milliseconds.
        return datetime.fromtimestamp(ts / 1000.0).strftime('%Y-%m-%d %H:%M:%S')

    # Print the results we accumulated, with some whitespace at the
    # front to separate this from the logging.
    print("\n\n")
    print("Number of events in data: %d" % total_event_count)
    print("An event:\n%s" % json.dumps(an_event, indent=2))
    print("Session with id '%s' has the most events: %d" % (
        longest_session_id, longest_session_count))
    print("First 10 events:")
    print("\n".join([
        "  %s: %s" % (timestamp_to_string(ts), location)
        for (location, ts) in first_events
    ]))
from IOCommon import IOCommon

if __name__ == "__main__":
    if len(sys.argv) != 3:
        # stderr.write works on both Python 2 and 3, unlike `print >>`.
        sys.stderr.write("Usage: terasort <HDFS_INPUT> <HDFS_OUTPUT>\n")
        sys.exit(-1)
    sc = SparkContext(appName="PythonTeraSort")
    reducer = int(IOCommon.getProperty("hibench.default.shuffle.parallelism"))

    version_api = IOCommon.getProperty("hibench.hadoop.version")
    # Fail early with a clear message; previously an unexpected value left
    # `lines` undefined and surfaced as a NameError at the sort step.
    if version_api not in ("hadoop1", "hadoop2"):
        raise ValueError(
            "unsupported hibench.hadoop.version: %r" % (version_api,))

    # load
    if version_api == "hadoop1":
        # The first 10 characters of each line form the key, the rest the value.
        lines = sc.textFile(sys.argv[1], 1).map(lambda x: (x[:10], x[10:]))
    else:
        lines = sc.newAPIHadoopFile(
            sys.argv[1],
            "org.apache.hadoop.examples.terasort.TeraInputFormat",
            "org.apache.hadoop.io.Text",
            "org.apache.hadoop.io.Text")

    # sort
    # The identity lambda used to be passed positionally, where sortByKey
    # expects `ascending`; it only worked because a function is truthy.
    # Ascending order with the default key function is the same behaviour.
    sortedCount = lines.sortByKey(numPartitions=reducer)

    # save
    if version_api == "hadoop1":
        # Re-join key and value into the original line. The mapped RDD is the
        # one to save; the original saved the (key, value) tuples instead.
        lines = sortedCount.map(lambda x: x[0] + x[1])
        lines.saveAsTextFile(sys.argv[2])
    else:
        sortedCount.saveAsNewAPIHadoopFile(
            sys.argv[2],
            "org.apache.hadoop.examples.terasort.TeraOutputFormat",
            "org.apache.hadoop.io.Text",
            "org.apache.hadoop.io.Text")
labeled_doc = model.label(doc_raw_text) sentence_index = 0 for sentence in document.sentences: labeled_sentence = labeled_doc[sentence_index] sentence_index += 1 doc_output += "\n" + sentence.sent_id + "\n" doc_output += sentence.text + "\n" word_index = 0 for word in sentence.words_conll: dictionary = labeled_sentence[word_index] doc_output += word + "\t" + dictionary["label"] + "\n" word_index += 1 return doc_output conf = SparkConf().setAppName('rnn2argument') sc = SparkContext(conf=conf) print("available nodes: ", sc.defaultParallelism) text = sc.newAPIHadoopFile( "conll.txt", 'org.apache.hadoop.mapreduce.lib.input.TextInputFormat', 'org.apache.hadoop.io.LongWritable', 'org.apache.hadoop.io.Text', conf={'textinputformat.record.delimiter': '# newdoc'}) text.map(lambda x: parse_doc(x))\ .map(lambda x: label_doc(x)) \ .saveAsTextFile("output")
# A blank line is used as the record delimiter.
# Note that the first record may have an extra line "total number:#" as its first line.
hadoop_conf = {"textinputformat.record.delimiter": "\n\n"}

is_ec2 = True
# If the submission is on EC2, you need to set access key and secret access key.
# NOTE(review): credentials are hard-coded here (placeholders in this copy);
# real keys should come from the environment or an instance role, never
# from source control.
if is_ec2:
    hadoop_conf["fs.s3n.awsAccessKeyId"] = "AKIaYOURaOWNaKEYaJPQ"
    hadoop_conf["fs.s3n.awsSecretAccessKey"] = "v5vBmazYOURaOWNaSECRETaKEYaT8yX4jXC+mGLl"
    master_add = "ec2-54-213-21-124.us-west-2.compute.amazonaws.com"

# Read the file with the function newAPIHadoopFile. The RDD object has elements like this: <lineNumber, textOfTweet>.
# With the function textFile in the Word Count example, the hadoopConf can not be passed in.
lines = sc.newAPIHadoopFile(filepath,
                            "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                            "org.apache.hadoop.io.LongWritable",
                            "org.apache.hadoop.io.Text",
                            conf=hadoop_conf)

# Print out records to understand the logic of RDD.
# NOTE: you can not print an RDD object directly. Take a small sample from it.
#print(lines.take(5))

# A tweet that has the text "No Post Title" is considered a bad record.
bad_msg = "W\tNo Post Title"

# In the class, MapReduce is introduced in a simple form. In Spark, map and reduce have more variants. Key-value pair <K, V> can be key
# <K> only. This function maps a record to <0> or <1>: 1 when the record
# text contains bad_msg, 0 otherwise. The `in` test replaces string.find(),
# a module-level function that exists only in Python 2.
flag = lines.map(lambda x: 1 if bad_msg in x[1] else 0)

# print mapped keys
#print(flag.take(5))
exit(-1) conf = SparkConf().set("spark.default.parallelism", "3") sc = SparkContext(appName="SequenceFile", conf=conf) path = sys.argv[1] out = sys.argv[2] # 读取sequence file 文件 #lines = sc.sequenceFile(path, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.LongWritable") # hadoop 老的api, mapred #lines = sc.hadoopFile(path, "org.apache.hadoop.mapred.SequenceFileInputFormat", # "org.apache.hadoop.io.Text", "org.apache.hadoop.io.LongWritable") # hadoop 新的api mapreduce lines = sc.newAPIHadoopFile(path, "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "org.apache.hadoop.io.Text", "org.apache.hadoop.io.IntWritable") # hadoop 新api 中getSplits getSplits(JobContext context) # hadoop 旧的api 中 getSplits(JobConf job, int numSplits) # textFile 用的旧的api 和 hadoopFile 相同 打印2个partition # newApiHadoopFile 打印一个partition print "sequence partitions: %s" % lines.getNumPartitions() results = lines.mapValues(lambda x: long(x)) for i in results.take(10): print i[0] , i[1] print results.count() # saveAsSequenceFile(path, compressionCodecClass=None) lines.saveAsSequenceFile(out, "org.apache.hadoop.io.compress.GzipCodec")
anchorDics = [{}]*num_dics for tid, tarID in enumerate(tarIDs): dicID = int(tid/maxAnchor) anchorDics[dicID][tarID] = anchorDic[tarID] anchorDicBC = sc.broadcast(anchorDics) time3 = time.time() print('load dic2', time3-time2) # rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_9_anchored_doc/part-[0-2]*', # rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_9_anchored_doc/part-[3-5]*', # rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_9_anchored_doc/part-*', # rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_12_anchored_doc/part-[0-2]*', rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_12_anchored_doc/part-[3-4]*', # rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_12_anchored_doc/part-*', 'org.apache.hadoop.mapreduce.lib.input.TextInputFormat', 'org.apache.hadoop.io.Text', 'org.apache.hadoop.io.Text') # reuse = 0 def process(webpage): if webpage: record = tryJson(webpage) else: # return 'None1' return None if not record: # return 'None2' return None if not record[1]: return None try:
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, yarn=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and optional script with mapper/reducer functions.

    Args:
        schema_file: Path of the Avro schema, read through ``ctx.textFile``.
        data_path: A single input path, or a list of paths that are unioned.
        script: Optional user script; must expose a ``MapReduce`` class.
        spec_file: Optional spec, either a path to a JSON file or a JSON string.
        verbose: When true, keep default logging and report elapsed time.
        yarn: When true, only logs that YARN client mode is enabled.

    Returns:
        The final reducer output, or ``None`` when ``script`` has no
        ``MapReduce``.
    """
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define spark context, it's main object which allow
    # to communicate with spark.
    # Only ship the user script when one was given: pyFiles=[None] would make
    # SparkContext try to register a None path.
    ctx = SparkContext(appName="AvroKeyInputFormat",
                       pyFiles=[script] if script else [])
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define input avro schema, the rdd is a list of lines (sc.textFile
    # similar to readlines): glue the lines together, then drop all
    # whitespace. str.join replaces the Python-2-only reduce() builtin.
    avsc = ''.join(rdd)
    schema = ''.join(avsc.split())
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([
            ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf)
            for f in data_path
        ])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite,
                                        aconv, conf=conf)

    # process data, here the map will read record from avro file
    # if we need a whole record we'll use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we'll extract it
    # within lambda function, e.g. lambda x: x[0]['jobid'], e.g.
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # in more general way we write mapper/reducer functions which will be
    # executed by Spark via collect call
    spec = None
    if spec_file:
        if os.path.isfile(spec_file):
            # Context manager so the spec file handle is always closed.
            with open(spec_file) as istream:
                spec = json.load(istream)
        else:
            spec = json.loads(spec_file)
    if script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s'
                         % (script, obj))
            ctx.stop()
            return
        # we have a nested use case when one MR return WMArchive spec
        # we'll loop in that case until we get non-spec output
        while True:
            mro = obj.MapReduce(spec)
            # example of collecting records from mapper and
            # passing all of them to reducer function
            records = avro_rdd.map(mro.mapper).collect()
            out = mro.reducer(records)
            logger.info('OUTPUT %s %s' % (out, type(out)))
            if is_spec(out):
                # The reducer produced another spec: feed it to the next round.
                spec = out
            else:
                break

        # the map(f).reduce(f) example but it does not collect
        # intermediate records
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer).collect()
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return out
from operator import add
import sys

from pyspark import SparkContext

# Give the application a name; no master is set here, so it comes from the
# launcher/configuration.
sc = SparkContext(appName="LZO Wordcount")

if __name__ == "__main__":
    if len(sys.argv) != 2:
        # stderr.write works on both Python 2 and 3, unlike `print >>`.
        sys.stderr.write("""
        Usage: wordcount_lzo_file.py <data_file>

        Run with example jar:
        spark-submit --driver-class-path /home/spark/lib/hadoop-lzo.jar /path/to/examples/wordcount_lzo_file.py <data_file>
        """ + "\n")
        sys.exit(-1)

    path = sys.argv[1]
    print(path)
    conf = None

    # Reading a file in HDFS (use absolute path). Despite the script name,
    # only the number of records in the file is computed and printed.
    csv = sc.newAPIHadoopFile(
        path,
        "com.hadoop.mapreduce.LzoTextInputFormat",
        "org.apache.hadoop.io.LongWritable",
        "org.apache.hadoop.io.Text").count()
    print(csv)
    #for k in output:
    #    print k
## Build the Spark context for a local run
conf = SparkConf()
conf = conf.setAppName("XMLPARSER")
conf = conf.setMaster("local")
sc = SparkContext(conf=conf)

## Optional command-line arguments: input file, then output directory
cli_args = sys.argv[1:]

## Input file (defaults to Posts.xml)
postsFile = cli_args[0] if cli_args else 'Posts.xml'

## Output directory for the processed RDD (defaults to results)
output = cli_args[1] if len(cli_args) > 1 else 'results'

## Start and end markers of one xml entry
xmlConf = {
    'xmlinput.start': '<row',
    'xmlinput.end': '/>',
}

## Load the file through hadoop with the spark-xml input format
records = sc.newAPIHadoopFile(
    postsFile,
    'com.databricks.spark.xml.XmlInputFormat',
    'org.apache.hadoop.io.Text',
    'org.apache.hadoop.io.Text',
    conf=xmlConf,
)

## Keep only the second element of each (key, value) record
normalizedRecords = records.map(lambda record: record[1])

## Run every record through processRows to pull out the fields
rdd = normalizedRecords.map(processRows)

## Persist the resulting RDD as a pickle file
rdd.saveAsPickleFile(output)

sc.stop()
''' Formatos de entrada/salida de Hadoop Spark puede interactuar con cualquier formato de fichero soportado por Hadoop - Soporta las APIs “vieja” y “nueva” - Permite acceder a otros tipos de almacenamiento (no fichero), p.e. HBase o MongoDB, a través de saveAsHadoopDataSet y/o saveAsNewAPIHadoopDataSet ''' %pyspark # Salvamos el RDD clave/valor como fichero de texto Hadoop (TextOutputFormat) rdd.saveAsNewAPIHadoopFile("file:///tmp/hadoopfileoutdir", "org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", "org.apache.hadoop.io.Text", "org.apache.hadoop.io.IntWritable") %sh echo 'Directorio de salida' ls -l /tmp/hadoopfileoutdir cat /tmp/hadoopfileoutdir/part-r-00001 %pyspark # Lo leemos como fichero clave-valor Hadoop (KeyValueTextInputFormat) rdd3 = sc.newAPIHadoopFile("file:///tmp/hadoopfileoutdir", "org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat", "org.apache.hadoop.io.Text", "org.apache.hadoop.io.IntWritable") print("Contenido del RDD {0}".format(rdd3.collect()))
from pyspark.sql import SQLContext def tryJson(data): try: return json.loads(data) except: return None sparkConf = SparkConf().setAppName("PySpark clueWeb09 doc collect") sc = SparkContext(conf=sparkConf) # rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_9_all_hadoop_t3/all_part-00000.4.preLSH', rdd1 = sc.newAPIHadoopFile( 'hdfs:///user/qile1864/anchor_frags/frags_12_all_hadoop_t9/all_part-00000.4.preLSH', 'org.apache.hadoop.mapreduce.lib.input.TextInputFormat', 'org.apache.hadoop.io.Text', 'org.apache.hadoop.io.Text') mergeSim = 0.9 hash_len = 128 hash_ngram = 3 def process(anchors): if not anchors: return None # try: record = tryJson(anchors) if not record: return None
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, rout=None, yarn=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and optional script with mapper/reducer functions.

    Args:
        schema_file: Path of the Avro schema, read through ``ctx.textFile``.
        data_path: A single input path, or a list of paths that are unioned.
        script: Optional user script, resolved through ``get_script``; must
            expose a ``MapReduce`` class.
        spec_file: Optional spec, either a path to a JSON file or a JSON string.
        verbose: When true, print progress and set ``spec['verbose']``.
        rout: Optional output location, stored as ``spec['output']``; counter
            jobs also write their count to this path.
        yarn: When true, only logs that YARN client mode is enabled.

    Returns:
        The final reducer output (or record count for counter jobs), or
        ``None`` when ``script`` has no ``MapReduce``.
    """
    if script:
        script = get_script(script)
    if verbose:
        print("### schema: %s" % schema_file)
        print("### path : %s" % data_path)
        print("### script: %s" % script)
        print("### spec : %s" % spec_file)
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define spark context, it's main object which allow
    # to communicate with spark.
    # Only ship the user script when one was given: pyFiles=[None] would make
    # SparkContext try to register a None path.
    ctx = SparkContext(appName="AvroKeyInputFormat",
                       pyFiles=[script] if script else [])
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define input avro schema, the rdd is a list of lines (sc.textFile
    # similar to readlines): glue the lines together, then drop all
    # whitespace. str.join replaces the Python-2-only reduce() builtin.
    avsc = ''.join(rdd)
    schema = ''.join(avsc.split())
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf)
                              for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

    # process data, here the map will read record from avro file
    # if we need a whole record we'll use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we'll extract it
    # within lambda function, e.g. lambda x: x[0]['jobid'], e.g.
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # in more general way we write mapper/reducer functions which will be
    # executed by Spark via collect call
    spec = None
    if spec_file:
        if os.path.isfile(spec_file):
            # Context manager so the spec file handle is always closed.
            with open(spec_file) as istream:
                spec = json.load(istream)
        else:
            spec = json.loads(spec_file)
    # verbose/rout are recorded inside the spec; without a spec_file there is
    # no dict to write into (spec is None), so create one on demand instead
    # of failing with a TypeError.
    if verbose:
        if spec is None:
            spec = {}
        spec['verbose'] = 1
        print("### spec %s" % json.dumps(spec))
    if rout:
        if spec is None:
            spec = {}
        spec['output'] = rout
    if script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s'
                         % (script, obj))
            ctx.stop()
            return
        # we have a nested use case when one MR return WMArchive spec
        # we'll loop in that case until we get non-spec output
        count = 0
        while True:
            mro = obj.MapReduce(spec)
            # Part of the instance's `name` attribute before the first dot;
            # an empty string when the instance has no such attribute.
            mname = mro.__dict__.get('name', '').split('.')[0]
            print("### Load %s" % mname)
            if mname.lower().endswith('counter'):
                # Counter jobs only need the number of matching records.
                out = avro_rdd.filter(mro.mapper).count()
                if rout:
                    with open(rout, 'w') as ostream:
                        # count() returns an int; write() requires a string.
                        ostream.write(str(out))
                break
            # example of collecting records from mapper and
            # passing all of them to reducer function
            records = avro_rdd.filter(mro.mapper).collect()
            out = mro.reducer(records)
            if verbose:
                print("### Loop count %s" % count)
            if count > 3:
                # Safety valve against reducers that keep returning specs.
                print("### WARNING, loop counter exceed its limit")
                break
            if is_spec(out):
                spec = out
            else:
                break
            count += 1

        # the map(f).reduce(f) example but it does not collect
        # intermediate records
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer).collect()
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return out
if __name__ == "__main__": if len(sys.argv) != 3: print >>sys.stderr, "Usage: terasort <HDFS_INPUT> <HDFS_OUTPUT>" exit(-1) sc = SparkContext(appName="PythonTeraSort") reducer = int(IOCommon.getProperty("hibench.default.shuffle.parallelism")) version_api = IOCommon.getProperty("hibench.hadoop.version") # load if version_api == "hadoop1": lines = sc.textFile(sys.argv[1], 1).map(lambda x: (x[:10], x[10:])) elif version_api == "hadoop2": lines = sc.newAPIHadoopFile( sys.argv[1], "org.apache.hadoop.examples.terasort.TeraInputFormat", "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text", ) # sort sortedCount = lines.sortByKey(lambda x: x, numPartitions=reducer) # save if version_api == "hadoop1": lines = sortedCount.map(lambda x: x[0] + x[1]) sortedCount.saveAsTextFile(sys.argv[2]) elif version_api == "hadoop2": sortedCount.saveAsNewAPIHadoopFile( sys.argv[2], "org.apache.hadoop.examples.terasort.TeraOutputFormat", "org.apache.hadoop.io.Text",
def launch_spark_job():
    """Build an edge graph from sequencing reads and show its connected components.

    Command-line arguments (taken from ``sys.argv``):
        1. path of the reads file,
        2. ``k``, parsed as an integer,
        3. number of partitions the reads are repartitioned into.

    The job reads '@'-delimited records, keeps those starting with "SRR",
    counts k-mers, derives a junction set that is broadcast to the workers,
    builds edges that do not touch a junction, and finally prints the
    connected components of the resulting GraphFrame.
    """
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext
    from pyspark.sql.functions import concat, col, lit

    readFile = sys.argv[1]
    k = int(sys.argv[2])
    num_partitions = int(sys.argv[3])

    conf = SparkConf().setAppName("reads Loader" + str(num_partitions))
    sc = SparkContext(conf=conf)
    sc.addPyFile("utils.py")
    sc.setCheckpointDir(
        "hdfs://doop-mng1.haifa.ibm.com:8020/projects/Store_Analytics/SparkCheckPoints"
    )
    # Imported only after addPyFile() has registered utils.py with the context.
    import utils

    readLines = (
        sc.newAPIHadoopFile(
            readFile,
            'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
            'org.apache.hadoop.io.LongWritable',
            'org.apache.hadoop.io.Text',
            conf={'textinputformat.record.delimiter': '@'})
        # keep just the record text, not the key of the (key, text) pair
        .map(lambda delim_lines_tup: delim_lines_tup[1])
        # drop entries produced by an '@' appearing in the wrong line
        .filter(lambda x: x.startswith("SRR"))
        # split into lines and keep only the first two
        .map(lambda x: x.split("\n")[:2])
        # get rid of any cut-off records
        .filter(lambda x: len(x) == 2)
        .repartition(num_partitions)
        # .cache()
    )
    print("----------------------there are %i reads" % (readLines.count()))

    # get new RDD including lists of kmers (with no Ns), (k+1)mers
    kmers = (readLines
             .map(lambda entry: entry[1])
             .flatMap(lambda read: getKmerToNextCharCounts(read, k)))
    print("----------------------there are %i kmers instances" % (kmers.count()))

    kmers_with_exts = kmers.reduceByKey(func=lambda x, y: x + y)
    print("----------------------there are %i distinct kmers" % (kmers_with_exts.count()))

    junctions = kmers_with_exts.filter(my_filter)
    print("----------------------there are %i junctions" % junctions.count())

    generate_juncs = build_partial_junctions_set()
    junctions_set_rdd = (junctions
                         .mapPartitions(generate_juncs)
                         .reduceByKey(merge_sets)
                         .collect())
    # collect() yields an empty list when no junction was found; fall back to
    # an empty set instead of failing with IndexError on [0][1].
    # NOTE(review): assumes the merged value is set-like (it is only used with
    # `in` and len() below) -- confirm against merge_sets.
    junction_values = junctions_set_rdd[0][1] if junctions_set_rdd else set()
    juncs_broadcast = sc.broadcast(junction_values)
    print("----------------------there are %i junctions in broadcast" %
          len(juncs_broadcast.value))

    # build edge set rdd, filter out edges including a junction at some end
    def read_line_map_function(read_line):
        # NOTE(review): the meaning of `k - 10` and `10` is defined by
        # utils.map_read_to_anchors_list and is not visible here.
        return utils.map_read_to_anchors_list(read_line[1], k - 10, 10,
                                              juncs_broadcast.value)

    def has_no_junction_end(edge):
        # Index-based access replaces the Python-2-only tuple-parameter
        # lambda `lambda (a, b, c): ...`, which is a SyntaxError on Python 3.
        return (edge[0] not in juncs_broadcast.value
                and edge[1] not in juncs_broadcast.value)

    edges_rdd = (readLines
                 .map(read_line_map_function)
                 .flatMap(utils.convert_anchors_list_to_seq_edges,
                          preservesPartitioning=True)
                 .filter(has_no_junction_end))
    print("----------------------there are %i total edges" % edges_rdd.count())

    # create SQLContext to be able to create dataFrame from rdd
    sqc = SQLContext(sc)
    edges_df = sqc.createDataFrame(edges_rdd, ["src", "dst", "overlap"])
    vertices_df = edges_df.select(
        concat(col("src"), lit(" "), col("dst")).alias('id')).dropDuplicates()
    g = GraphFrame(vertices_df, edges_df)
    print("----------------------there are %i total vertices" % vertices_df.count())

    # get connected components of remaining graph
    result = g.connectedComponents()
    result.select("id", "component").orderBy("component").show()
from elasticsearch import search
from sparql import query_abstract
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

sc = SparkContext("yarn", "wdps1811")

# Header line that carries the record identifier.
KEYNAME = "WARC-TREC-ID"

# Positional command-line arguments.
INFILE = sys.argv[1]
OUTFILE = sys.argv[2]
ELASTICSEARCH = sys.argv[3]
SPARQL = sys.argv[4]

# Read the input as text records separated by the "WARC/1.0" marker.
rdd = sc.newAPIHadoopFile(
    INFILE,
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf={"textinputformat.record.delimiter": "WARC/1.0"})


def find_key(payload):
    """Return the text between the first and second ':' of the first
    line of *payload* that starts with KEYNAME, or "" if no line does."""
    matches = (header.split(":")[1]
               for header in payload.splitlines()
               if header.startswith(KEYNAME))
    return next(matches, "")


vectorizer = TfidfVectorizer()
config["stage1File"] = filepath with open("config_{0}.json".format(index), "w") as outfile: json.dump(config, outfile, indent=4) subprocess.call(["./mdm", "config_{0}.json".format(index)]) subprocess.call([ "mv", "Final_MDM_{0}.root".format(index), "/hdfs/user/hjayatissa/geant_mdm_csi/stage2" ]) subprocess.call([ "/usr/local/hadoop/bin/hdfs", "dfs", "-chown", "hjayatissa", "/user/hjayatissa/geant_mdm_csi/stage2/Final_MDM_{0}.root".format( index) ]) if __name__ == "__main__": sconf = SparkConf().setAppName("mdm-CsI-2") sconf.set("spark.executor.memory", "13g") sconf.set("spark.python.worker.reuse", "false") sc = SparkContext(conf=sconf) sc.addFile("mdm") sc.addFile("config/config_isobutane_22Ne_6Li_geant_oxford.json") sc.addFile("run_oxf.mac") file_name = "hdfs://gr-gmaster.tamu.edu:9000//user/hjayatissa/geant_mdm_csi/stage1/MDM_*.root" lines = sc.newAPIHadoopFile(file_name, "edu.tamu.hadoop.RootInputFormat", "org.apache.hadoop.io.IntWritable", "org.apache.hadoop.io.Text") lines.foreach(lambda x: stage2(x))
with codecs.open(dicFile, 'r', 'utf-8') as f: lines = [line.strip() for line in f] for line in lines: url = line.split('\t')[0] docDic[url] = len(docDic) # print(docDic) docDicBC = sc.broadcast(docDic) # CW 09 rdd1 = sc.newAPIHadoopFile( 'hdfs:///corpora/corpora-thirdparty/corpus-clueweb/09-mapfile/data-r-*', # CW 12 # rdd1 = sc.newAPIHadoopFile('hdfs:///corpora/corpora-thirdparty/corpus-clueweb/12-mapfile/data-r-*', # rdd1 = sc.newAPIHadoopFile('hdfs:///corpora/corpora-thirdparty/corpus-clueweb/12-mapfile/data-r-00000', 'org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat', 'org.apache.hadoop.io.Text', 'org.apache.hadoop.io.Text') maxHtml = 50000 def checkDoc(data): if not data: # print('not data') # return 'not data' return try: # record = json.loads(data) record = tryJson(data) # print(str(record)[:100])
conf = SparkConf().set("spark.default.parallelism", "3") sc = SparkContext(appName="SequenceFile", conf=conf) path = sys.argv[1] out = sys.argv[2] # 读取sequence file 文件 #lines = sc.sequenceFile(path, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.LongWritable") # hadoop 老的api, mapred #lines = sc.hadoopFile(path, "org.apache.hadoop.mapred.SequenceFileInputFormat", # "org.apache.hadoop.io.Text", "org.apache.hadoop.io.LongWritable") # hadoop 新的api mapreduce lines = sc.newAPIHadoopFile( path, "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "org.apache.hadoop.io.Text", "org.apache.hadoop.io.IntWritable") # hadoop 新api 中getSplits getSplits(JobContext context) # hadoop 旧的api 中 getSplits(JobConf job, int numSplits) # textFile 用的旧的api 和 hadoopFile 相同 打印2个partition # newApiHadoopFile 打印一个partition print "sequence partitions: %s" % lines.getNumPartitions() results = lines.mapValues(lambda x: long(x)) for i in results.take(10): print i[0], i[1] print results.count() # saveAsSequenceFile(path, compressionCodecClass=None) lines.saveAsSequenceFile(out, "org.apache.hadoop.io.compress.GzipCodec")
from operator import add import sys from pyspark import SparkContext #Giving a Name and using the local Spark Master sc = SparkContext(appName="LZO Wordcount") if __name__ == "__main__": if len(sys.argv) != 2: print >> sys.stderr, """ Usage: wordcount_lzo_file.py <data_file> Run with example jar: spark-submit --driver-class-path /home/spark/lib/hadoop-lzo.jar /path/to/examples/wordcount_lzo_file.py <data_file> """ exit(-1) path = sys.argv[1] print path conf = None #Reading a file in HDFS(use absolute path) csv = sc.newAPIHadoopFile(path, "com.hadoop.mapreduce.LzoTextInputFormat", "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text").count() print csv #for k in output: # print k
# Despite its name, `text` holds the LongWritable class name; it is used as
# both key and value class for the first sequence file.
text = "org.apache.hadoop.io.LongWritable"
dep_seq = sc.sequenceFile(
    path="file:///home/cloudera/departments/part-00001",
    keyClass=text,
    valueClass=text)
dep = sc.sequenceFile(
    path="/user/cloudera/intelli",
    keyClass="org.apache.hadoop.io.LongWritable",
    valueClass="departments")

# Elasticsearch variant, kept for reference:
# $ SPARK_CLASSPATH=/path/to/elasticsearch-hadoop.jar ./bin/pyspark
# conf = {"es.resource" : "index/type"}  # assume Elasticsearch is running on localhost defaults
# rdd = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat",
#                          "org.apache.hadoop.io.NullWritable",
#                          "org.elasticsearch.hadoop.mr.LinkedMapWritable", conf=conf)
rdd = sc.newAPIHadoopFile(
    "/user/cloudera/intelli/part-m-00000",
    "org.apache.hadoop.mapreduce.lib.input.SequenceFileAsBinaryInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text")

## The approach above does not work.
# Conclusion: if the sqoop import does not produce standard Writable types in
# its sequence-file output, one has to build a converter class and pass it via
# valueConverter="valueConverterClass" to sc.sequenceFile or
# newAPIHadoopFile. Writing such a converter class is not easy.
#
# Writable Type      Python Type
# Text               unicode str
# IntWritable        int
# FloatWritable      float
# DoubleWritable     float
# BooleanWritable    bool
# BytesWritable      bytearray
# NullWritable       None
# Input and output locations are derived from `filename`.
source_path = '/' + filename
dest_path = '/filter1_' + filename

key_set = [
    '101', '103', '195', '196', '198', '199', '200', '202', '203', '204',
    '205', '210', '295', '296', '298', '299', '304', '321', '350', '380',
    '395', '398', '399', '499', '527', '540', '541', '542', '543', '558',
    '564', '565', '578', '595', '596', '598', '599', '699', '700', '799',
    '899', '996', '998', '999',
]

sc = SparkContext()

# Read the source as text records separated by the '{1:' marker.
# NOTE(review): TextInputFormat produces (LongWritable, Text) pairs, so the
# key/value class names below look swapped -- confirm before changing them.
source = sc.newAPIHadoopFile(
    source_path,
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.Text',
    'org.apache.hadoop.io.LongWritable',
    conf={'textinputformat.record.delimiter': '{1:'})

# Fixed: this used the misspelled name `srouce`, which raised NameError.
# filter_message() is applied to the value of each pair; falsy results are dropped.
type_matched = source.map(lambda m: filter_message(m[1])).filter(lambda x: x)

# Aggregate the per-message report entries, render each as a CSV line and
# write a single output file. saveAsTextFile() returns None, so `type_report`
# carries no data.
type_report = (type_matched
               .map(generate_IO_report)
               .reduceByKey(add)
               .map(toCSVLine)
               .coalesce(1)
               .saveAsTextFile(dest_path))

# receiving_bic_matched=type_matched.filter_receiving_bic(lambda x:filter_receiving_bic(x,args.receiving_bic_type))
# sending_bic_matched = receiving_bic_matched.filter_sending_bic(lambda x: filter_sending_bic(x, args.sending_bic_type))
""" if __name__ == "__main__": if len(sys.argv) != 2: print(""" Usage: parquet_inputformat.py <data_file> Run with example jar: ./bin/spark-submit --driver-class-path /path/to/example/jar \\ /path/to/examples/parquet_inputformat.py <data_file> Assumes you have Parquet data stored in <data_file>. """, file=sys.stderr) exit(-1) path = sys.argv[1] sc = SparkContext(appName="ParquetInputFormat") parquet_rdd = sc.newAPIHadoopFile( path, 'org.apache.parquet.avro.AvroParquetInputFormat', 'java.lang.Void', 'org.apache.avro.generic.IndexedRecord', valueConverter= 'org.apache.spark.examples.pythonconverters.IndexedRecordToJavaConverter' ) output = parquet_rdd.map(lambda x: x[1]).collect() for k in output: print(k) sc.stop()
key, name, entity_id = record yield key + '\t' + name + '\t' + entity_id if __name__ == "__main__": try: _, DOMAIN_ES, DOMAIN_KB, INPUT, OUTPUT = sys.argv except Exception: print('Usage: DOMAIN_ES, DOMAIN_TRIDENT') sys.exit(0) SPACY = spacy.load("en_core_web_sm") # Spark setup with conf from command line sc = SparkContext() # split WARC config = {"textinputformat.record.delimiter": "WARC/1.0"} # Read the Warc file to rdd rdd = sc.newAPIHadoopFile(INPUT, "org.apache.hadoop.mapreduce.lib.input.TextInputFormat", "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text", conf=config) # Process the warc files, result is an rdd with each element "key + '\t' + name + '\t' + FreebaseID" rdd = rdd.flatMap(record_to_html) rdd = rdd.flatMap(html_to_text) rdd = rdd.flatMap(named_entity_recognition) rdd = rdd.flatMap(generate_candidates) rdd = rdd.flatMap(output) #print(rdd.take(10)) result = rdd.saveAsTextFile(OUTPUT)
Run with example jar: ./bin/spark-submit --driver-class-path /path/to/example/jar \ /path/to/examples/avro_inputformat.py <data_file> [reader_schema_file] Assumes you have Avro data stored in <data_file>. Reader schema can be optionally specified in [reader_schema_file]. """ exit(-1) path = sys.argv[1] sc = SparkContext(appName="AvroKeyInputFormat") conf = None if len(sys.argv) == 3: schema_rdd = sc.textFile(sys.argv[2], 1).collect() conf = { "avro.schema.input.key": reduce(lambda x, y: x + y, schema_rdd) } avro_rdd = sc.newAPIHadoopFile( path, "org.apache.avro.mapreduce.AvroKeyInputFormat", "org.apache.avro.mapred.AvroKey", "org.apache.hadoop.io.NullWritable", keyConverter= "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter", conf=conf) output = avro_rdd.map(lambda x: x[0]).collect() for k in output: print k
# Ship the project's helper modules to the workers.
for module_name in ("element.py", "node.py", "quadtree.py", "query.py",
                    "util.py", "voxel.py", "executionengine.py"):
    sc.addPyFile(dir_path + "/" + module_name)
# sc.addFile(path.dirname(path.abspath(__file__)), True)

# Build the query from the third argument and broadcast it.
query = Query.defineQuery(float(argv[3]))
query_broadcast = sc.broadcast(query)

# Read the input as text records separated by blank lines.
text_rdd = sc.newAPIHadoopFile(
    argv[1],
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.Text",
    "org.apache.hadoop.io.LongWritable",
    conf={"textinputformat.record.delimiter": "\n\n"})

# Pipeline: partitions -> trees -> candidates -> matching pairs.
tree_rdd = text_rdd.mapPartitions(build_tree)
candidates_rdd = tree_rdd.flatMap(produce_candidates_color)
relations_rdd = candidates_rdd.map(prepare_matching_pairs)
# NOTE(review): filter_rdd is defined but never used; the collected result
# below is the unfiltered relations_rdd -- confirm this is intended.
filter_rdd = relations_rdd.map(filter_candidates)

result = relations_rdd.collect()
sc.stop()

# Persist the collected result at the path given as second argument.
save_result(result, argv[2])
from pyspark import SparkContext, SparkConf, SQLContext conf = SparkConf().setAppName("FileFormatReader") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) # conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") ff = sc.newAPIHadoopFile( "/sparkdemo/hdfs/sample.avro", "org.apache.avro.mapreduce.AvroKeyInputFormat", "org.apache.avro.mapred.AvroKey", "org.apache.hadoop.io.NullWritable", keyConverter= "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter") # dataFrame = sqlContext.createDataFrame(ff) # dataFrame.write.format("com.databricks.spark.avro").save("/sparkdemo/hdfs/") print ff.collect()
# * List the 5 most common elements for each order (word, bigram, trigram...). For each element, list the sequence of words and the number of occurances. # # Basically, you need to change all punctuations to a space and define as a word anything that is between whitespace or at the beginning or the end of a sentence, and does not consist of whitespace (strings consisiting of only white spaces should not be considered as words). The important thing here is to be simple, not to be 100% correct in terms of parsing English. Evaluation will be primarily based on identifying the 5 most frequent n-grams in correct order for all values of n. Some slack will be allowed in the values of frequency of ngrams to allow flexibility in text processing. # # This text is short enough to process on a single core using standard python. However, you are required to solve it using RDD's for the whole process. At the very end you can use `.take(5)` to bring the results to the central node for printing. # The code for reading the file and splitting it into sentences is shown below: # In[1]: #path = '../Data/Moby-Dick.txt' path = '/data/Moby-Dick.txt' textRDD = sc.newAPIHadoopFile(path, 'org.apache.hadoop.mapreduce.lib.input.TextInputFormat', 'org.apache.hadoop.io.LongWritable', 'org.apache.hadoop.io.Text', conf={'textinputformat.record.delimiter': "\r\n\r\n"}) \ .map(lambda x: x[1]) sentences=textRDD.flatMap(lambda x: x.split(". ")) # Note: For running the file on cluster, change the file path to `'/data/Moby-Dick.txt'` # Let `freq_ngramRDD` be the final result RDD containing the n-grams sorted by their frequency in descending order. Use the following function to print your final output: # In[2]: def printOutput(n,freq_ngramRDD): top=freq_ngramRDD.take(5) print '\n============ %d most frequent %d-grams'%(5,n)
def runActionsHistoryQuery(schema_file, data_path, verbose=None, yarn=None):
    """
    Function to run a pyspark job to find the logs associated with the
    actionshistory.json file.

    To run, use a spec file with only timerange, e.g.
    "spec":{ "timerange":[20180706,20180706]}}
    and then execute: myspark --spec=large_query.spec --logfail

    :param schema_file: path of the Avro schema, read through ``ctx.textFile``
    :param data_path: a single input path or a list of input paths
    :param verbose: when true, print the inputs and log the elapsed time
    :param yarn: when true, log that YARN client mode is enabled
    :return: always 0; the joined records are written to HDFS
    """
    if verbose:
        print("### schema: %s" % schema_file)
        print("### path : %s" % data_path)
        # The former "### spec" line printed `spec_file`, which is neither a
        # parameter nor a local of this function and raised NameError.
    time0 = time.time()

    # pyspark modules
    from pyspark import SparkContext

    # ---- record-level helpers shipped to the workers ----

    def getFailing(row):
        """Keep only failed jobs that have a non-empty LFNArray."""
        rec = row[0]
        meta = rec.get('meta_data', {})
        if meta.get('jobstate', '') != 'jobfailed':
            return False
        return rec.get('LFNArray', []) != []

    def avro_rdd_KV(row):
        """Create the (task, record) pair used for the join."""
        rec = row[0]
        return (rec["task"], rec)

    def filterLogCollect(row):
        """Keep only logcollect jobs."""
        meta = row[0].get('meta_data', {})
        return meta.get('jobtype', '').lower() == 'logcollect'

    def log_KV(row):
        """Emit (log archive LFN, job summary) pairs for a joined failing job."""
        # After the join each row is (task, (record, task)).
        rec = row[1][0]
        lfn_array = rec.get('LFNArray', [])
        steps = rec.get('steps', [])
        out_dict = {
            'task': rec["task"],
            'jobstate': rec.get('meta_data', {}).get('jobstate', ''),
            'status': [step.get('status', '') for step in steps],
            'site': [step.get('site', '') for step in steps]
        }
        return [(lfn, out_dict) for lfn in lfn_array if 'logArch' in lfn]

    def logcoll_KV(row):
        """Emit (log archive LFN, logcollect summary) pairs for a logcollect job."""
        rec = row[0]
        lfn_array = rec.get('LFNArray', [])
        # The last LFN mentioning "logcollect" wins, "" if there is none.
        logCollect = ""
        for lfn in lfn_array:
            if 'logcollect' in lfn.lower():
                logCollect = lfn
        out_dict = {
            'logcollect_task': rec["task"],
            'logcollect_jobstate': rec.get('meta_data', {}).get('jobstate', ''),
            'logcollect_lfn': logCollect
        }
        return [(lfn, out_dict) for lfn in lfn_array if 'logArch' in lfn]

    # define spark context, it's main object which allow
    # to communicate with spark
    ctx = SparkContext(appName="AvroKeyInputFormat")
    try:
        logger = SparkLogger(ctx)
        if not verbose:
            logger.set_level('ERROR')
        if yarn:
            logger.info("YARN client mode enabled")

        # load FWJR schema; collect() returns the schema file as a list of lines
        rdd = ctx.textFile(schema_file, 1).collect()

        # Merge all lines and strip every whitespace character. str.join
        # replaces the Python-2-only builtin reduce() and copes with an empty list.
        schema = ''.join(''.join(rdd).split())
        conf = {"avro.schema.input.key": schema}

        # define newAPIHadoopFile parameters, java classes
        aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
        akey = "org.apache.avro.mapred.AvroKey"
        awrite = "org.apache.hadoop.io.NullWritable"
        aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

        # load data from HDFS
        if isinstance(data_path, list):
            avro_rdd = ctx.union([
                ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf)
                for f in data_path
            ])
        else:
            avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite,
                                            aconv, conf=conf)

        # load failing tasks
        rdd_failing_tasks = ctx.textFile(
            "hdfs:///cms/users/llayer/failing_tasks.csv")

        # first step: filter the data for failing workflows and join with
        # the actionshistory workflows (keyed by task name)
        fail_workflows = avro_rdd.filter(getFailing) \
            .map(avro_rdd_KV) \
            .join(rdd_failing_tasks.map(lambda x: (x, x)))

        # second step: filter the data for logcollect tasks
        logColl = avro_rdd.filter(filterLogCollect).flatMap(logcoll_KV)

        # join the frames with log archives as keys
        result = fail_workflows.flatMap(log_KV).join(logColl)

        # write back the records to hdfs
        result.saveAsTextFile("hdfs:///cms/users/llayer/logs10.csv")
    finally:
        # Always release the context, even when the job fails part-way.
        ctx.stop()

    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return 0
# # * Convert all text to lower case, remove all punctuations. (Finally, the text should contain only letters, numbers and spaces) # * Count the occurance of each word and of each 2,3,4,5 - gram # * List the 5 most common elements for each order (word, bigram, trigram...). For each element, list the sequence of words and the number of occurances. # # Basically, you need to change all punctuations to a space and define as a word anything that is between whitespace or at the beginning or the end of a sentence, and does not consist of whitespace (strings consisiting of only white spaces should not be considered as words). The important thing here is to be simple, not to be 100% correct in terms of parsing English. Evaluation will be primarily based on identifying the 5 most frequent n-grams in correct order for all values of n. Some slack will be allowed in the values of frequency of ngrams to allow flexibility in text processing. # # This text is short enough to process on a single core using standard python. However, you are required to solve it using RDD's for the whole process. At the very end you can use `.take(5)` to bring the results to the central node for printing. # The code for reading the file and splitting it into sentences is shown below: # In[1]: textRDD = sc.newAPIHadoopFile('/data/Moby-Dick.txt', 'org.apache.hadoop.mapreduce.lib.input.TextInputFormat', 'org.apache.hadoop.io.LongWritable', 'org.apache.hadoop.io.Text', conf={'textinputformat.record.delimiter': "\r\n\r\n"}) \ .map(lambda x: x[1]) sentences = textRDD.flatMap(lambda x: x.split(". ")) # #### Note: # By default, the delimiter string in Spark is "\n". Thus, values in each partition of textRDD describe lines from the file rather than sentences. As a result, sentences may be split over multiple lines. For this input file, a good approach will be to delimit by paragraph so that each value in the RDD is one paragraph (instead of one line). Then we can split the paragraphs into sentences. 
# # This is done by setting the `textinputformat.record.delimiter` parameter to `"\r\n\r\n"` in the configuration file. # Let `freq_ngramRDD` be the final result RDD containing the n-grams sorted by their frequency in descending order. Use the following function to print your final output: # In[2]:
if i[1] in d: d[i[1]].append(int(i[2])) else: d[i[1]] = [int(i[2])] v = [] for i in word['reviews']: v.append((i[1], d[i[1]])) return v conf = SparkConf().setMaster("local").setAppName("WordCount") sc = SparkContext(conf=conf) pp = pprint.PrettyPrinter(indent=1) text_file = sc.newAPIHadoopFile( 'amazon-meta.txt', "org.apache.hadoop.mapreduce.lib.input.TextInputFormat", "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text", conf={"textinputformat.record.delimiter": '\r\n\r\n'}) # '\r\n\r\n' amazon = text_file.map(mapWord) #pp.pprint(amazon.take(5)) ''' a = amazon .filter(lambda x: "0312982178" == x['ASIN']) \#retorna um boleando .map(produtoBestWorst) \#retorna para todos e muda valor .take(1)[0] \ #pegar somente o primeiro elemento do vetor pois se não da erro, ele quer pegar todo o vet # print(vet) # best = vet[0:5] # rate = vet[-5:] # pp.pprint(best) # pp.pprint(rate) pp.pprint(a[0:5])
if __name__ == "__main__":
    # Exactly one argument is expected: the Parquet data file.
    if len(sys.argv) != 2:
        print(
            """
        Usage: parquet_inputformat.py <data_file>

        Run with example jar:
        ./bin/spark-submit --driver-class-path /path/to/example/jar \\
        /path/to/examples/parquet_inputformat.py <data_file>
        Assumes you have Parquet data stored in <data_file>.
        """,
            file=sys.stderr,
        )
        exit(-1)

    path = sys.argv[1]
    sc = SparkContext(appName="ParquetInputFormat")

    # Each record arrives as a (key, value) pair; the value converter turns
    # the IndexedRecord value into a Python object.
    parquet_rdd = sc.newAPIHadoopFile(
        path,
        inputFormatClass="parquet.avro.AvroParquetInputFormat",
        keyClass="java.lang.Void",
        valueClass="org.apache.avro.generic.IndexedRecord",
        valueConverter="org.apache.spark.examples.pythonconverters.IndexedRecordToJavaConverter",
    )

    # Keep only the values and print them one per line on the driver.
    values_rdd = parquet_rdd.map(lambda pair: pair[1])
    output = values_rdd.collect()
    for k in output:
        print(k)

    sc.stop()