def insert_data_into_hdfs():
    # Delete the file if it exists
    if hadoopy.exists(tb_path):
        hadoopy.rmr("-skipTrash %s" % tb_path)
    # Write to HDFS
    # user$ hadoop dfsadmin -safemode leave
    # (run this first to avoid the error: Cannot create file/user/edge_list.tb. Name node is in safe mode.)
    hadoopy.writetb(tb_path, get_kv_from_file(data_file_path))
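# Hedged sketch (not part of the snippet above): leaving HDFS safe mode from
# Python instead of typing "hadoop dfsadmin -safemode leave" by hand. Assumes
# the hadoop binary is on PATH.
import subprocess

def leave_safemode():
    # Same effect as: user$ hadoop dfsadmin -safemode leave
    subprocess.check_call(['hadoop', 'dfsadmin', '-safemode', 'leave'])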
def main():
    if hadoopy.exists(hdfs_output):
        hadoopy.rmr("-skipTrash %s" % hdfs_output)
    hadoopy.launch(hdfs_path, hdfs_output, 'WordCount.py', files=['../stop_words.txt'])
def custom_initialization():
    host = 'localhost'
    connection = happybase.Connection(host)
    wiki_table = connection.table('wiki')
    hdfs_path = 'wiki_index.tb'
    hadoopy.rmr("-skipTrash %s" % hdfs_path)  # Remove the file first (cleanup)
    hadoopy.writetb(hdfs_path, wiki_table.scan(limit=1000))  # Write the wiki table into HDFS
def rmr(path):
    """Remove path from HDFS."""
    try:
        hadoopy.rmr(path)
    except IOError:
        return False
    return True
def hdfs_temp(hdfs_temp_dir=None):
    if hdfs_temp_dir is None:
        hdfs_temp_dir = HDFS_TEMP_DIR
    temp_path = hadoopy.abspath('%s/%f-%f' % (hdfs_temp_dir, time.time(), random.random()))
    yield temp_path
    if hadoopy.exists(temp_path):
        hadoopy.rmr(temp_path)
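# Hedged sketch: hdfs_temp() yields a temp path and removes it afterwards, so it
# is presumably decorated with @contextlib.contextmanager in its original module.
# This standalone variant (the default directory name is an assumption) shows the
# intended usage.
import contextlib
import random
import time
import hadoopy

@contextlib.contextmanager
def hdfs_temp_path(hdfs_temp_dir='_hadoopy_temp'):
    temp_path = hadoopy.abspath('%s/%f-%f' % (hdfs_temp_dir, time.time(), random.random()))
    try:
        yield temp_path
    finally:
        if hadoopy.exists(temp_path):
            hadoopy.rmr(temp_path)

# Usage: everything written under the yielded path is cleaned up on exit.
# with hdfs_temp_path() as temp_path:
#     hadoopy.writetb(temp_path + '/input', [('key', 'value')])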
def insert_vector_into_hdfs(hdfs_path, iterator):
    # Delete the file if it exists
    if hadoopy.exists(hdfs_path):
        hadoopy.rmr("-skipTrash %s" % hdfs_path)
    # Write to HDFS
    # user$ hadoop dfsadmin -safemode leave
    # (run this first to avoid the error: Cannot create file/user/edge_list.tb. Name node is in safe mode.)
    hadoopy.writetb(hdfs_path, iterator)
def tearDown(self):
    if hadoopy.exists(self.data_path):
        self.assertTrue(hadoopy.isempty(self.data_path))  # directories are empty
        self.assertTrue(hadoopy.isdir(self.data_path))
        hadoopy.rmr(self.data_path)
    self.assertFalse(hadoopy.exists(self.data_path))
    self.assertFalse(hadoopy.isdir(self.data_path))
    self.assertFalse(hadoopy.isempty(self.data_path))
def throughput_test(launcher):
    output_path = '_hadoopy_bench/%f' % time.time()
    v = 'blah'
    kv = (v, {'client_time': time.time(), 'value_len': len(v), 'count': 0})
    num_files = 3
    num_kvs = 10000000
    hadoopy.writetb(output_path + '/input/0', (kv for x in xrange(num_kvs)))
    for x in range(1, num_files):
        hadoopy.cp(output_path + '/input/0', output_path + '/input/%d' % x)
    hadoopy.freeze_script('time_job.py')  # Factor out PyInstaller time
    st = time.time()
    launcher(output_path + '/input', output_path + '/output', 'time_job.py')
    print((num_kvs * num_files) / (time.time() - st))
    hadoopy.rmr(output_path)
def latency_test(launcher):
    output_path = '_hadoopy_bench/%f' % time.time()
    v = 'blah'
    kv = (v, {'client_time': time.time(), 'value_len': len(v), 'count': 0})
    hadoopy.writetb(output_path + '/input', [kv])
    launcher(output_path + '/input', output_path + '/output', 'time_job.py')
    v = hadoopy.readtb(output_path + '/output').next()[1]
    v['server_time'] = time.time()
    t0 = v['worker_time'] - v['client_time']
    t1 = v['server_time'] - v['worker_time']
    t2 = v['server_time'] - v['client_time']
    print((t0, t1, t2))
    hadoopy.rmr(output_path)
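# Hedged usage sketch for the two benchmarks above: "launcher" is presumably any
# callable with the (input path, output path, script) signature of
# hadoopy.launch; time_job.py is assumed to be the benchmark's job script.
import hadoopy

latency_test(hadoopy.launch)            # plain Hadoop Streaming launch
throughput_test(hadoopy.launch_frozen)  # PyInstaller-frozen launch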
def flickr_images(tags, images_per_tag, hdfs_output, num_files=20, max_iters=1, max_pages=1,
                  output_meta=False, api_key=None, api_secret=None, remove_output=False):
    tags = list(tags)
    if api_key is None or api_secret is None:
        api_key = os.environ['FLICKR_API_KEY']
        api_secret = os.environ['FLICKR_API_SECRET']
    tags_per_chunk = max(len(tags) / num_files, 1)
    if remove_output and hadoopy.exists(hdfs_output):
        print('Removing output dir[%s]' % hdfs_output)
        hadoopy.rmr(hdfs_output)
    cmdenvs = {'FLICKR_API_KEY': api_key,
               'FLICKR_API_SECRET': api_secret,
               'MAX_ITERS': str(max_iters),
               'MAX_PAGES': str(max_pages)}
    for chunk_num, chunk_tags in enumerate(_chunks(tags, tags_per_chunk)):
        hadoopy.writetb(hdfs_output + '/tags/%d' % chunk_num,
                        [(images_per_tag, tag) for tag in chunk_tags])
    hadoopy.launch_frozen(hdfs_output + '/tags', hdfs_output + '/metadata', _lf('flickr_bulk.py'),
                          cmdenvs=cmdenvs, num_reducers=num_files)
    output_type = 'meta' if output_meta else 'image'
    hadoopy.launch_frozen(hdfs_output + '/metadata', hdfs_output + '/image_metadata',
                          _lf('file_downloader.py'), cmdenvs={'OUTPUT_TYPE': output_type})
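# Hedged sketch (hypothetical helpers): _chunks and _lf are used above but not
# shown in the snippet. Presumably _chunks splits the tag list into fixed-size
# groups and _lf resolves a job script path relative to the calling module.
import os

def _chunks(seq, chunk_size):
    # Yield successive chunk_size-sized slices of seq
    for i in xrange(0, len(seq), chunk_size):
        yield seq[i:i + chunk_size]

def _lf(fn):
    # Path of a job script that lives next to this module
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), fn)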
def extractUsefulData(num_line, start_date, end_date):
    year = str(start_date)[:4]
    month = str(start_date)[4:6]
    start_day = str(start_date)[-2:]
    end_day = str(end_date)[-2:]
    home_dir_source = 'hdfs://BigDataPOC:8020/datalab/exp_vsb/inputData'
    home_dir_des = 'hdfs://BigDataPOC:8020/datalab/exp_b02/data/gps_data'
    for i in np.arange(int(start_day), int(end_day) + 1):
        if i < 10:
            date = '0' + str(i)
        else:
            date = str(i)
        file_source = 'loc_bus_' + str(start_date)[:6] + date + '_' + str(num_line) + '.csv'
        source = os.path.join(home_dir_source, file_source)
        home_dir_des_line = os.path.join(home_dir_des, str(num_line))
        home_dir_des_month = os.path.join(home_dir_des_line, str(start_date)[:6])
        if not os.path.exists(home_dir_des_line):
            try:
                os.mkdir(os.path.dirname(home_dir_des_line))
            except OSError:
                pass
        if not os.path.exists(home_dir_des_month):
            try:
                os.mkdir(os.path.dirname(home_dir_des_month))
            except OSError:
                pass
        file_des = 'bus_gps_' + str(start_date)[:6] + date + '_' + str(num_line) + '.csv'
        destination = os.path.join(home_dir_des_month, file_des)
        if hadoopy.exists(destination):
            hadoopy.rmr(destination)
        getGpsData(source, destination)
        print 'it is finished: ' + file_des
hiveStatementForPythonCreate += ");" print "hiveStatementForPythonCreate:"+hiveStatementForPythonCreate; hivestrcommandForPython = ["hive","-e",hiveStatementForPythonCreate] current2 = datetime.datetime.now() call(hivestrcommandForPython) current3 = datetime.datetime.now() print "hive2 second="+str((current3 - current2).seconds) #impalaStatementForCreate = "use tax;refresh tax.tax_access_log_python;insert overwrite TABLE tax_access_log_partition PARTITION (date_hour) SELECT client_ip,client,userid,request,method,uri,protocal,path,params,query,fileType,fileName,status,bytes_sent, date_time,referer,useragent,host,concat(strleft(from_unixtime(unix_timestamp(date_time)),14),'00:00')as date_hour from tax.tax_access_log_python;"; #####3.delete old data for deltime in deleteTime : hdfsFilePath = '"/user/hive/warehouse/tax.db/tax_access_log_partition/date_hour='+deltime+'"' if hadoopy.exists(hdfsFilePath) == 1: print "remove file path:"+hdfsFilePath hadoopy.rmr('"/user/hive/warehouse/tax.db/tax_access_log_partition/date_hour='+deltime+'"') #####4.insert Impala impalaStatementForCreate = "use tax;refresh tax.tax_access_log_python;" impalaStatementForCreate += " insert into TABLE tax_access_log_partition PARTITION (date_hour) " impalaStatementForCreate += " SELECT client_ip,client,userid,request,method,uri,protocal,path,params,query,fileType,fileName,status,bytes_sent, date_time,referer,useragent,host,concat(strleft(from_unixtime(unix_timestamp(date_time)),14),'00:00')as date_hour " impalaStatementForCreate += " from tax.tax_access_log_python" impalaStatementForCreate += " where " tempStatement =[] for insert_time in insertTime : tempStatement += ["date_time like '"+insert_time+"'"] impalaStatementForCreate += " or ".join(tempStatement) impalaStatementForCreate += ";"
#!/usr/bin/env python
import hadoopy

input_path = "/alice.txt"
output_path = "/result"

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)

hadoopy.launch(input_path, output_path, 'WordCount.py')

word_counts = dict(hadoopy.readtb(output_path))
for word in word_counts:
    print "%s: %d" % (word, word_counts[word])
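# Hedged sketch of what the WordCount.py job script launched above might look
# like: hadoopy job scripts define a mapper/reducer and hand them to
# hadoopy.run(). The original WordCount.py is not shown in this listing.
import hadoopy

def mapper(key, value):
    # value is a line of text; emit (word, 1) for every token
    for word in value.split():
        yield word, 1

def reducer(word, counts):
    # Sum the partial counts emitted for each word
    yield word, sum(counts)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)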
import hadoopy
import os
import logging

input_path = '/data/corpus_data'
output_path = '/data/output'
local_path = '/app/opencorpora'

# Utilities
def read_local_dir(local_path):
    for fn in os.listdir(local_path):
        path = os.path.join(local_path, fn)
        if os.path.isfile(path):
            yield path, open(path).read()

# Clean up any previous data and write the input
if hadoopy.exists(input_path):
    hadoopy.rmr(input_path)
if hadoopy.exists(output_path):
    hadoopy.rmr(output_path)
hadoopy.writetb(input_path, read_local_dir(local_path))

# Launch the job
hadoopy.launch_frozen(input_path, output_path, 'wc.py')

# Read the results back
word_counts = dict(hadoopy.readtb(output_path))
for w3, tpl in word_counts.items():
    if tpl[1] > 4:
        print tpl[0][0], tpl[0][1], tpl[0][2], tpl[1], tpl[2], tpl[3]
import hadoopy

tb_path = "hdfs://localhost:9000/user/user/edge_list.tb"
N = 64375

if hadoopy.exists(tb_path):
    hadoopy.rmr("-skipTrash %s" % tb_path)

def read_edge_wiki(file_object):
    while True:
        line = file_object.readline().split()
        if not line:
            break
        yield (line[0].decode('utf-8'), 1.0 / N), [l.decode('utf-8') for l in line[1:]]
        #yield line[0].decode('utf-8'), line[1].decode('utf-8')

def main():
    with open('edge_list.txt') as f:
        hadoopy.writetb(tb_path, read_edge_wiki(f))

if __name__ == '__main__':
    main()
def calcul_delta(vector_before, vector_after):
    # Sum of absolute differences between two (key, value) vectors
    before = {}
    after = {}
    s = 0
    for k, v in vector_before:
        before[k] = v
    for k, v in vector_after:
        after[k] = v
    for k in before:
        s += np.abs(before[k] - after[k])
    return s

##############################################################################
if hadoopy.exists(temp_vector_path):
    hadoopy.rmr("-skipTrash %s" % temp_vector_path)
copy(eigen_vector_tb_path, temp_vector_path)
while diff > 0.01:
    eigen_vector_before = load_eigen_vector(temp_vector_path)
    if hadoopy.exists(temp_vector_path):
        hadoopy.rmr("-skipTrash %s" % temp_vector_path)
    hadoopy.launch_local(data_tb_path, temp_vector_path, 'PageRank.py')
    eigen_vector_after = load_eigen_vector(temp_vector_path)
    if hadoopy.exists(eigen_vector_tb_path):
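# Hedged sketch (hypothetical helpers): load_eigen_vector and copy are used in
# the loop above but not shown in the snippet. Presumably they read the rank
# vector back as (node, rank) pairs and duplicate an HDFS path, roughly:
import hadoopy

def load_eigen_vector(hdfs_path):
    # Materialize the (node, rank) pairs written by the PageRank job
    return list(hadoopy.readtb(hdfs_path))

def copy(src_path, dst_path):
    # Same role as hadoopy.cp in the benchmark snippet above
    hadoopy.cp(src_path, dst_path)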
input_path = "hdfs://localhost:9000/user/user/input.tb" output_path = "hdfs://localhost:9000/user/user/vector" temp_path = "hdfs://localhost:9000/user/user/temp" def read_vector(vect): for i,v in enumerate(vect): yield str(i).encode('utf-8'),v N = 64375 diff=1. r0 = np.ones(N).astype(np.float)/N if hadoopy.exists(input_path): hadoopy.rmr("-skipTrash %s"%input_path) os.system('hdfs dfs -cp '+edge_path+' '+input_path) if hadoopy.exists(output_path): hadoopy.rmr("-skipTrash %s"%output_path) hadoopy.writetb(output_path,read_vector(r0)) if hadoopy.exists(temp_path): hadoopy.rmr("-skipTrash %s"%temp_path) iteration = 0 while diff>0.01: if hadoopy.exists(temp_path): hadoopy.rmr("-skipTrash %s"%temp_path) hadoopy.launch(input_path,temp_path,'PageRank.py',files=[])
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import hadoopy
import os
import sys
import happybase
import numpy as np

hdfs_path = 'simplewikiFromHbase'  # equivalent to "http://localhost/user/user/simplewikiFromHbase"
local_path = 'simplewikiFromHbaseLocal'

if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s" % hdfs_path)

connection = happybase.Connection('localhost', '9090')
if 'simplewiki' not in connection.tables():
    sys.exit("Error: no simplewiki table found")
else:
    print "OK: simplewiki table found"
table_wiki = connection.table('simplewiki')

NdocsMax = 30000

def read_hbase(table_hbase):
    for key, data in table_hbase.scan(limit=NdocsMax):
        yield key.decode('utf-8'), data['wiki:text'].decode('utf-8')

#def read_local_dir(local_path):
#    for fn in os.listdir(local_path):
#        path = os.path.join(local_path, fn)
#        if os.path.isfile(path):
temp_path = "hdfs://localhost:9000/user/user/temp" def read_vector(vect): for i, v in enumerate(vect): yield str(i).encode('utf-8'), v N = 64375 diff = 1. r0 = np.ones(N).astype(np.float) / N if hadoopy.exists(input_path): hadoopy.rmr("-skipTrash %s" % input_path) os.system('hdfs dfs -cp ' + edge_path + ' ' + input_path) if hadoopy.exists(output_path): hadoopy.rmr("-skipTrash %s" % output_path) hadoopy.writetb(output_path, read_vector(r0)) if hadoopy.exists(temp_path): hadoopy.rmr("-skipTrash %s" % temp_path) iteration = 0 while diff > 0.01: if hadoopy.exists(temp_path): hadoopy.rmr("-skipTrash %s" % temp_path) hadoopy.launch(input_path, temp_path, 'PageRank.py', files=[])
hbase_table = 'wiki'
hdfs_path = 'wiki.tb'
host = 'localhost'
connection = happybase.Connection(host)
wiki_table = connection.table(hbase_table)

def get_url_content_for_hdfs():
    for url, content in wiki_table.scan():
        v = content['cf:content'].encode('utf-8')
        yield url, v

if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s" % hdfs_path)  # Remove the file first (cleanup)
hadoopy.writetb(hdfs_path, get_url_content_for_hdfs())  # Write the wiki table into HDFS

# Test OK (ATIH 2/12/2015)
url_content_dict = dict(hadoopy.readtb(hdfs_path))
for k, v in url_content_dict.iteritems():
    print 'k = ', k
    print 'v = ', v
    break

for k, v in hadoopy.readtb(hdfs_path):
    print 'k = ', k.encode('utf-8')
    print 'v = ', v.encode('utf-8')
    break
#input_path="hdfs://localhost:9000/alice.txt" input_hdfs_path="hdfs://localhost:9000/user/user/simplewikiFromHbase" output_hdfs_path='hdfs://localhost:9000/user/user/indexwikiFromSpark' words_stop = [line.rstrip('\n') for line in open('../stop_words.txt')] words_stop.append('') sc=SparkContext() lines = sc.sequenceFile(input_hdfs_path).map(lambda (x,y):(x[5:].decode('utf-8'),y[5:].decode('utf-8'))) splitText = lines.map(lambda (url,text):(url,[stem(word.group().lower()) for word in re.finditer(r"\w+",text,re.UNICODE) if word.group().lower() not in words_stop])) tf = splitText.map(lambda (url,splittedText):(url,{word:1.0*splittedText.count(word)/len(splittedText) for word in splittedText})) tfWordAsKey = tf.flatMap(lambda (url,tf):[(word,[(url,tf[word])]) for word in tf]).reduceByKey(lambda a,b:a+b) tfidf = tfWordAsKey.map(lambda (word,tfList):(word,[(url,tf*np.log10(27474.0/len(tfList))) for (url,tf) in tfList])) NwordsMax = 200000 def read_rdd(rdd): for key,data in rdd.takeSample(True,NwordsMax): yield key,data if hadoopy.exists(output_hdfs_path): hadoopy.rmr("-skipTrash %s"%output_hdfs_path) hadoopy.writetb(output_hdfs_path,read_rdd(tfidf))