import collections
import re
from datetime import datetime

### `ut` is the project's tweet-utility module; its import is not shown in this section.

def main(argv):
    infile = argv[1]
    outfile1 = argv[2]
    outfile2 = argv[3]
    OUT1 = open(outfile1, 'w')
    OUT2 = open(outfile2, 'w')
    unicode_count = 0
    period = 60
    tag_dict = {}
    ### The deque holds up to 61 entries: the 60-second window plus one extra slot
    ### to tolerate out-of-order tweets arriving within about 2 seconds.
    tag_dq = collections.deque([], maxlen=period + 1)
    last_time = ''
    k = 0
    N_v = 0
    d_total = 0
    ### ASCII control characters 1-31 and a same-length run of spaces,
    ### passed to tweets_cleaned_json for cleaning.
    escapes = ''.join([chr(char) for char in range(1, 32)])
    temp_list = 31 * ' '
    with open(infile) as IN:
        for line in IN:
            (content, time1, unicode_flag) = ut.tweets_cleaned_json(line, escapes, temp_list)
            if not time1:
                continue
            if unicode_flag == 1:
                unicode_count += 1
            OUT1.write(content + " (timestamp: " + time1 + ")\n")
            tag_list = re.findall('#[^ ]*', content)                            ### find all candidate tags in the text
            tag_list = [ut.clean_tag(x.strip('#')) for x in tag_list]           ### strip the leading '#' and clean each tag
            tag_list = filter(None, [y.lower() for x in tag_list for y in x])   ### flatten, drop empty tags, lowercase
            tag_list = ['#' + x for x in tag_list]                              ### put the '#' back in front of each tag
            ### The timestamp looks like "Thu Oct 29 17:51:01 +0000 2015";
            ### rebuild it as "Oct 29 2015 17 51 01" for strptime (seconds come from temp_str1[2]).
            temp_str = time1.split(" ")
            temp_str1 = temp_str[3].split(":")
            time_str = (temp_str[1].strip() + ' ' + temp_str[2].strip() + ' ' + temp_str[5].strip() + ' '
                        + temp_str1[0].strip() + ' ' + temp_str1[1].strip() + ' ' + temp_str1[2].strip())
            time2 = datetime.strptime(time_str, '%b %d %Y %H %M %S')
            N_v, d_total, last_time = ut.average_degree(tag_list, time2, N_v, d_total,
                                                        tag_dict, tag_dq, last_time, period)
            if N_v == 0:
                av_degree = 0
            else:
                av_degree = float(d_total) / N_v
            if k == 1:   ### every line after the first is preceded by a newline
                OUT2.write("\n%.2f" % av_degree)
            else:
                k = 1
                OUT2.write("%.2f" % av_degree)
    OUT1.write(str(unicode_count) + " tweets contained unicode.")
    OUT1.close()
    OUT2.close()
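### The helpers in `ut` are not shown in this section.  As a rough, hypothetical
### sketch (not the repo's actual ut.average_degree), a rolling-window average
### degree of the hashtag graph can be maintained like this: every tweet with
### two or more distinct tags adds an edge per tag pair, edges older than
### `period` seconds are evicted, and the average degree is 2 * |E| / |V|.
from datetime import timedelta
from itertools import combinations

def rolling_average_degree(window, new_tags, new_time, period=60):
    """window: a list of (timestamp, edge) tuples; edge is a sorted tag pair."""
    for pair in combinations(sorted(set(new_tags)), 2):
        window.append((new_time, pair))
    cutoff = new_time - timedelta(seconds=period)              # drop edges outside the window
    window[:] = [(t, e) for (t, e) in window if t > cutoff]
    edges = set(e for (_, e) in window)                        # de-duplicate repeated pairs
    vertices = set(v for e in edges for v in e)
    return 2.0 * len(edges) / len(vertices) if vertices else 0.0

### e.g. rolling_average_degree(win, ['#spark', '#hadoop'], time2) after each tweet.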
import glob
import os
import subprocess

### `ut` is the project's tweet-utility module (import not shown here).

def main(argv):
    if len(argv) != 4:
        print "4 args required!"
        print "Usage: code_challenge_hadoop.py input.txt output1.txt output2.txt"
        exit(1)
    current_dir = os.getcwd()
    infile = argv[1].strip()
    outfile1 = argv[2].strip()
    outfile2 = argv[3].strip()
    result_dir = current_dir + '/count_result'
    hadoop_home = os.getenv('HADOOP_HOME').strip()
    stream_jar = glob.glob(hadoop_home + '/contrib/streaming/hadoop-streaming*.jar')[0]
    mapper_file = current_dir + '/count_mapper_hive.py'
    reducer_file = current_dir + '/count_reducer_hive.py'

    ### Clean the tweets, write them to outfile1 (ft1.txt), and count tweets containing unicode.
    OUT1 = open(outfile1, 'w')
    unicode_count = 0
    k = 0
    escapes = ''.join([chr(char) for char in range(1, 32)])
    temp_list = 31 * ' '
    with open(infile) as IN:
        for line in IN:
            (content, time1, unicode_flag) = ut.tweets_cleaned_json(line, escapes, temp_list)
            if not time1:
                continue
            if unicode_flag == 1:
                unicode_count += 1
            if k == 1:
                OUT1.write("\n" + content + " (timestamp: " + time1 + ")")
            else:
                k = 1
                OUT1.write(content + " (timestamp: " + time1 + ")")
    OUT1.close()

    ### For convenience of Hadoop streaming, ft1.txt is split into ft1_hadoop.txt
    ### (tweets only) and unicode_count.txt (the unicode count).
    OUT2 = open(outfile2, 'w')
    OUT2.write(str(unicode_count) + " tweets contained unicode.")
    OUT2.close()

    ### Extract tags and combine identical tags that share the same time (within one minute).
    input_file = outfile1
    call_str = 'hadoop jar ' + stream_jar + ' -mapper ' + mapper_file + ' -reducer ' + reducer_file + \
               ' -file ' + mapper_file + ' -file ' + reducer_file + ' -input ' + input_file + ' -output ' + \
               result_dir + ' -mapper cat -reducer aggregate'
    call_list = ['hadoop', 'jar', stream_jar, '-mapper', mapper_file, '-reducer', reducer_file,
                 '-file', mapper_file, '-file', reducer_file, '-input', input_file,
                 '-output', result_dir, '-mapper', 'cat', '-reducer', 'aggregate']
    print call_str   ### printed for reference; the actual call uses call_list
    subprocess.Popen(call_list).wait()

    ### Build the Hive data warehouse from the map-reduce output.
    from hive_service import ThriftHive
    from thrift import Thrift
    from thrift.transport import TSocket
    from thrift.transport import TTransport
    from thrift.protocol import TBinaryProtocol

    input_hive = result_dir + "/part-00000"
    if os.path.isfile(result_dir + "/_SUCCESS"):
        try:
            transport = TSocket.TSocket('localhost', 10000)
            transport = TTransport.TBufferedTransport(transport)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
            client = ThriftHive.Client(protocol)
            transport.open()
            print "Creating tables through Thrift requires the following change in the MySQL metastore:"
            print "alter table SDS alter column IS_STOREDASSUBDIRECTORIES set default 0;"
            client.execute("drop table if exists edge_source")
            client.execute("""CREATE EXTERNAL TABLE IF NOT EXISTS edge_source(year INT, month INT, day INT,
                              hour INT, minute INT, tag1 STRING, tag2 STRING, count INT)
                              ROW FORMAT DELIMITED FIELDS TERMINATED BY ','""")
            call_str = "LOAD DATA LOCAL INPATH '" + input_hive + "' INTO TABLE edge_source"
            client.execute(call_str)
            client.execute("drop table if exists edge")
            client.execute("""CREATE TABLE IF NOT EXISTS edge(tag1 STRING, tag2 STRING, count INT)
                              PARTITIONED BY (year INT, month INT, day INT, hour INT, minute INT)
                              ROW FORMAT DELIMITED FIELDS TERMINATED BY ','""")
            client.execute("set hive.exec.dynamic.partition=true")
            client.execute("set hive.exec.dynamic.partition.mode=nonstrict")
            client.execute("""INSERT OVERWRITE TABLE edge PARTITION (year, month, day, hour, minute)
                              SELECT tag1, tag2, count, year, month, day, hour, minute FROM edge_source""")
            #client.execute("select * from edge")
            #print client.fetchAll()
        except Thrift.TException, tx:
            print "Error occurred!"
            print '%s' % (tx.message)
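### Hypothetical follow-up (not part of the original script): once `edge` is
### loaded, the same Thrift interface can read results back, e.g. the number of
### tag-pair rows per minute.  The query itself is an assumption; the host, port
### and every client call mirror the code above.
from hive_service import ThriftHive
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol

def edges_per_minute(host='localhost', port=10000):
    transport = TTransport.TBufferedTransport(TSocket.TSocket(host, port))
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = ThriftHive.Client(protocol)
    transport.open()
    client.execute("""SELECT year, month, day, hour, minute, COUNT(*)
                      FROM edge GROUP BY year, month, day, hour, minute""")
    rows = client.fetchAll()   # one string per result row, as with the commented-out fetchAll above
    transport.close()
    return rows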
import collections
import glob
import os
import subprocess
from datetime import datetime

### `ut` is the project's tweet-utility module (import not shown here).

def main(argv):
    if len(argv) != 5:
        print "5 args required!"
        print "Usage: code_challenge_hadoop.py input.txt output1.txt output2.txt output3.txt"
        exit(1)
    current_dir = os.getcwd()
    infile = argv[1].strip()
    outfile1 = argv[2].strip()
    outfile2 = argv[3].strip()
    outfile3 = argv[4].strip()
    result_dir = current_dir + "/count_result"
    hadoop_home = os.getenv("HADOOP_HOME").strip()
    stream_jar = glob.glob(hadoop_home + "/contrib/streaming/hadoop-streaming*.jar")[0]
    mapper_file = current_dir + "/count_mapper.py"
    reducer_file = current_dir + "/count_reducer.py"

    ### Clean the tweets, write them to outfile1 (ft1.txt), and count tweets containing unicode.
    OUT1 = open(outfile1, "w")
    unicode_count = 0
    k = 0
    escapes = "".join([chr(char) for char in range(1, 32)])
    temp_list = 31 * " "
    with open(infile) as IN:
        for line in IN:
            (content, time1, unicode_flag) = ut.tweets_cleaned_json(line, escapes, temp_list)
            if not time1:
                continue
            if unicode_flag == 1:
                unicode_count += 1
            if k == 1:
                OUT1.write("\n" + content + " (timestamp: " + time1 + ")")
            else:
                k = 1
                OUT1.write(content + " (timestamp: " + time1 + ")")
    OUT1.close()

    ### For convenience of Hadoop streaming, ft1.txt is split into ft1_hadoop.txt
    ### (tweets only) and unicode_count.txt (the unicode count).
    OUT2 = open(outfile2, "w")
    OUT2.write(str(unicode_count) + " tweets contained unicode.")
    OUT2.close()

    ### Extract tags and combine identical tags that share the same time (within one minute).
    input_file = outfile1
    call_str = (
        "hadoop jar " + stream_jar
        + " -mapper " + mapper_file + " -reducer " + reducer_file
        + " -file " + mapper_file + " -file " + reducer_file
        + " -input " + input_file + " -output " + result_dir
        + " -mapper cat -reducer aggregate"
    )
    call_list = [
        "hadoop", "jar", stream_jar,
        "-mapper", mapper_file, "-reducer", reducer_file,
        "-file", mapper_file, "-file", reducer_file,
        "-input", input_file, "-output", result_dir,
        "-mapper", "cat", "-reducer", "aggregate",
    ]
    print call_str   ### printed for reference; the actual call uses call_list
    subprocess.Popen(call_list).wait()

    if os.path.isfile(result_dir + "/_SUCCESS"):
        period = 10   ### update the average degree of the last 10 minutes, once per minute
        tag_dict = {}
        tag_list = []
        tag_dq = collections.deque([], maxlen=period)
        last_time = ""
        N_v, d_total, k = 0, 0, 0
        OUT3 = open(outfile3, "w")
        with open(result_dir + "/part-00000") as IN:
            for line in IN:
                fields = line.split("\\t")
                time_str, tag1, tag2, count = fields[0], fields[1], fields[2], fields[3]
                timestamp = datetime.strptime(time_str, "%Y-%m-%d %H:%M")
                if k == 0:
                    k = 1
                    last_time = timestamp
                if timestamp == last_time:
                    if tag1 and tag2:
                        tag_list.append((tag1, tag2, int(count)))   ### same minute and non-empty: accumulate
                else:
                    N_v, d_total = ut.average_degree_hadoop(tag_list, last_time, N_v, d_total, tag_dict, tag_dq, period)
                    if N_v == 0:
                        av_degree = 0
                    else:
                        av_degree = float(d_total) / N_v
                    tag_list = [(tag1, tag2, int(count))]
                    OUT3.write(last_time.strftime("%Y-%m-%d %H:%M") + "\t%.2f\n" % av_degree)
                    last_time = timestamp
        ### Flush the final minute.
        N_v, d_total = ut.average_degree_hadoop(tag_list, last_time, N_v, d_total, tag_dict, tag_dq, period)
        if N_v == 0:
            av_degree = 0
        else:
            av_degree = float(d_total) / N_v
        OUT3.write(last_time.strftime("%Y-%m-%d %H:%M") + "\t%.2f" % av_degree)
        OUT3.close()
    else:
        print "Error occurred in map reduce!"
        exit(1)
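### A minimal entry point of the kind each of these scripts presumably uses
### (the original files' __main__ guards are not shown in this section).
if __name__ == "__main__":
    import sys
    main(sys.argv)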