def __init__(self, *args, **kw):
    super(TestUsingHadoop, self).__init__(*args, **kw)
    cur_time = time.time()
    fetch_data.main()
    self.data_path = 'hadoopy-test-data/%f/' % cur_time
    try:
        hadoopy.mkdir('hadoopy-test-data')
    except IOError:
        pass
def test_local(self):
    out_path = '%s/local_test/%f' % (self.data_path, time.time())
    hadoopy.mkdir(out_path)
    hadoopy.put('wc-input-alice.tb', out_path + '/wc-input-alice.tb')
    hadoopy.launch_local(out_path + '/wc-input-alice.tb', out_path + '/out_list_cmdenvs', 'local.py',
                         max_input=1000,
                         cmdenvs=['TEST_ENV=10'],
                         files=['wc-input-alice.tb'])  # Just bring this along to test the files
    hadoopy.launch_local(out_path + '/wc-input-alice.tb', out_path + '/out', 'local.py',
                         max_input=1000,
                         cmdenvs={'TEST_ENV': '10'},
                         files=['wc-input-alice.tb'])  # Just bring this along to test the files
    hadoopy.launch_local(((1000 * 'a', 10000000 * 'b') for x in range(100)), None, 'local.py',
                         max_input=10000,
                         cmdenvs=['TEST_ENV=10'],
                         files=['wc-input-alice.tb'])
def freeze_script(script_path, cache=True, temp_path='_hadoopy_temp'):
    """Freezes a script, puts it on hdfs, and gives you the path

    'frozen_tar_path' can be given to launch_frozen and it will use that
    instead of making its own, this is useful for repeated calls.  If a
    file with the same md5 already exists in the temp_path, it is used
    instead of putting a new copy there to avoid the file transfer.  The
    files are put into a temporary file based on the timestamp first, then
    moved to a location that is only a function of their md5 to prevent
    partial files.

    Args:
        script_path: Path to a hadoopy script
        cache: If True (default) then use previously frozen scripts.
            Cache is stored in memory (not persistent).
        temp_path: HDFS temporary path (default is '_hadoopy_temp')

    Returns:
        {'cmds': commands_ran, 'frozen_tar_path': frozen_tar_path}

    Raises:
        ValueError: Script cannot be found
    """
    script_abspath = os.path.abspath(script_path)
    if not os.path.exists(script_abspath):
        raise ValueError('Script [%s] does not exist.' % script_abspath)
    try:
        if not cache:
            raise KeyError  # NOTE(brandyn): Don't use cache item
        cmds, frozen_tar_path = FREEZE_CACHE[script_abspath]
    except KeyError:
        tmp_frozen_tar_path = temp_path + '/%f.tar' % time.time()
        freeze_fp = tempfile.NamedTemporaryFile(suffix='.tar')
        cmds = hadoopy._freeze.freeze_to_tar(os.path.abspath(script_path), freeze_fp.name)
        md5 = _md5_file(freeze_fp.name)
        frozen_tar_path = temp_path + '/%s.tar' % md5
        if not hadoopy.exists(frozen_tar_path):
            if not hadoopy.exists(temp_path):  # CDH4 Fix
                hadoopy.mkdir(temp_path)
            hadoopy.put(freeze_fp.name, tmp_frozen_tar_path)
            try:
                hadoopy.mv(tmp_frozen_tar_path, frozen_tar_path)
            except IOError:
                if not hadoopy.exists(frozen_tar_path):  # Check again
                    raise
    FREEZE_CACHE[script_abspath] = cmds, frozen_tar_path
    return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
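# A minimal usage sketch (added for illustration, not part of the original
# source).  Per the docstring above, the returned 'frozen_tar_path' can be
# passed to hadoopy.launch_frozen so repeated launches reuse the same frozen
# tar on HDFS instead of re-freezing the script.  The input path, output
# path, and script name below are hypothetical placeholders.
import hadoopy

def launch_with_cached_freeze():
    frozen = freeze_script('wc.py')  # freeze once; result is cached in memory
    for run in range(3):
        hadoopy.launch_frozen('wc-input-alice.tb',
                              'out/wc-run-%d' % run,
                              'wc.py',
                              frozen_tar_path=frozen['frozen_tar_path'])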
def doSample(jarfile, inputs, output, k):
    for item in inputs:
        if item[-1] == "/":
            name = (item[:-1]).split('/')[-1]
        else:
            name = item.split('/')[-1]
        print "item", item
        #tmp_dir = tmp_path + name + "/"
        if hadoopy.exists(item):
            continue
        hadoopy.mkdir(item)
        #tmp_inputs.append(tmp_dir)
        real_input = data_dir + name + "/"
        for f in hadoopy.ls(real_input):
            if not hadoopy.isdir(f):
                #ff = tmp_dir + f.split('/')[-1]
                if k > 0:
                    poolSample(f, item, k)
                else:
                    commonSample(f, item, ratio)
    '''if not hadoopy.exists(output):
        hadoopy.mkdir(output)
    if hadoopy.isdir(output):
        output = output[:-1]
    if output[-1] == '/':
        output = output[:-1]
    name = output.split('/')[-1]
    tmp_output = tmp_path + name + "/"'''
    #if not hpath.exists(tmp_output):
    #    hdfs.mkdir(tmp_output)
    codegen.executeJar(jarfile, inputs, output)
    #jobid = job.getJobIDFromLog(tmp_log_dir)
    job_para = job.getJobPara()
    '''for item in tmp_inputs:
        os.system("hadoop fs -rmr " + item)
    os.system("hadoop fs -rmr " + tmp_output)'''
    return job_para
try:
    arg2 = int(sys.argv[2])
except Exception:
    arg2 = 1000
try:
    arg3 = sys.argv[3]
except Exception:
    arg3 = "/logs"
hdfs_path = arg3
if not hadoopy.exists(hdfs_path):
    print "does not exist, hence creating directory in hdfs"
    hadoopy.mkdir(hdfs_path)
else:
    print "writing to hdfs"
if not os.path.exists("./logs"):
    os.makedirs("./logs")
ts = time.time()
ts = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d%H%M%S')
fw = open("./logs" + "/" + "data" + ts, "w")
dataList = []
for i in xrange(arg2):
    string = (randomDate("1/1/2010-1:30:00", "1/1/2014-4:50:60", random.random()) + " " +
              pub_list[int(random.random() * 10) % len(pub_list)] + " " +
              advertiser_list[int(random.random() * 10) % len(advertiser_list)] + " " +
              website_list[int(random.random() * 10) % len(website_list)] + " " +
              geo_list[int(random.random() * 10) % len(geo_list)] + " " +
              str(round(random.random(), 4)) + " " +
              str(int(random.random() * 10000)))
    if (i + 1) % 1000 == 0:
def setUp(self):
    try:
        hadoopy.mkdir(self.data_path)
    except IOError:
        pass