Example #1
0
 def test_local(self):
     """Exercise hadoopy.launch_local three ways: list-style cmdenvs,
     dict-style cmdenvs, and an in-memory generator as the input source."""
     out_path = '%s/local_test/%f' % (self.data_path, time.time())
     hadoopy.mkdir(out_path)
     alice = out_path + '/wc-input-alice.tb'
     hadoopy.put('wc-input-alice.tb', alice)
     # files= ships the local copy along purely to exercise the files mechanism.
     hadoopy.launch_local(alice, out_path + '/out_list_cmdenvs', 'local.py',
                          max_input=1000, cmdenvs=['TEST_ENV=10'],
                          files=['wc-input-alice.tb'])
     hadoopy.launch_local(alice, out_path + '/out', 'local.py',
                          max_input=1000, cmdenvs={'TEST_ENV': '10'},
                          files=['wc-input-alice.tb'])
     # Feed (key, value) pairs straight from a generator; no output path needed.
     hadoopy.launch_local(((1000 * 'a', 10000000 * 'b') for x in range(100)),
                          None, 'local.py', max_input=10000,
                          cmdenvs=['TEST_ENV=10'],
                          files=['wc-input-alice.tb'])
Example #2
0
 def __init__(self, *args, **kw):
     """Stage a uniquely-timestamped HDFS data path and fetch test fixtures."""
     super(TestUsingHadoop, self).__init__(*args, **kw)
     # Timestamp is taken before fetch_data.main(), same as the original flow.
     self.data_path = 'hadoopy-test-data/%f/' % time.time()
     fetch_data.main()
     try:
         hadoopy.mkdir('hadoopy-test-data')
     except IOError:
         pass  # base directory already exists on HDFS
Example #3
0
 def __init__(self, *args, **kw):
     """Initialize the test case and pick a timestamp-unique HDFS data dir."""
     super(TestUsingHadoop, self).__init__(*args, **kw)
     # Capture the timestamp before fetching data so the path reflects start time.
     started = time.time()
     fetch_data.main()
     self.data_path = 'hadoopy-test-data/%f/' % started
     try:
         hadoopy.mkdir('hadoopy-test-data')
     except IOError:
         pass  # already present -- fine for the tests
Example #4
0
 def test_local(self):
     """Run local.py via launch_local with file input (list and dict cmdenvs)
     and again with a generator of synthetic (key, value) pairs."""
     out_path = '%s/local_test/%f' % (self.data_path, time.time())
     hadoopy.mkdir(out_path)
     hadoopy.put('wc-input-alice.tb', out_path + '/wc-input-alice.tb')
     src = out_path + '/wc-input-alice.tb'
     # The local tb file rides along only to test the files= mechanism.
     hadoopy.launch_local(src,
                          out_path + '/out_list_cmdenvs',
                          'local.py',
                          max_input=1000,
                          cmdenvs=['TEST_ENV=10'],
                          files=['wc-input-alice.tb'])
     hadoopy.launch_local(src,
                          out_path + '/out',
                          'local.py',
                          max_input=1000,
                          cmdenvs={'TEST_ENV': '10'},
                          files=['wc-input-alice.tb'])
     pairs = ((1000 * 'a', 10000000 * 'b') for x in range(100))
     hadoopy.launch_local(pairs,
                          None,
                          'local.py',
                          max_input=10000,
                          cmdenvs=['TEST_ENV=10'],
                          files=['wc-input-alice.tb'])
Example #5
0
def freeze_script(script_path, cache=True, temp_path='_hadoopy_temp'):
    """Freezes a script, puts it on hdfs, and gives you the path

    'frozen_tar_path' can be given to launch_frozen and it will use that
    instead of making its own, this is useful for repeated calls.  If a
    file with the same md5 already exists in the temp_path, it is used
    instead of putting a new copy there to avoid the file transfer.  The
    files are put into a temporary file based on the timestamp first, then
    moved to a location that is only a function of their md5 to prevent partial
    files.

    Args:
        script_path: Path to a hadoopy script
        cache: If True (default) then use previously frozen scripts.  Cache is stored in memory (not persistent).
        temp_path: HDFS temporary path (default is '_hadoopy_temp')

    Returns:
        {'cmds': commands_ran, 'frozen_tar_path': frozen_tar_path}

    Raises:
        ValueError: Script cannot be found
    """
    script_abspath = os.path.abspath(script_path)
    if not os.path.exists(script_abspath):
        raise ValueError('Script [%s] does not exist.' % script_abspath)
    try:
        if not cache:
            raise KeyError  # NOTE(brandyn): Don't use cache item
        cmds, frozen_tar_path = FREEZE_CACHE[script_abspath]
    except KeyError:
        # Stage under a timestamp-based name first, then move to the md5-based
        # name, so a concurrent reader never sees a partially-written tar.
        tmp_frozen_tar_path = temp_path + '/%f.tar' % time.time()
        # Context manager guarantees the local temp tar is deleted promptly
        # instead of waiting for garbage collection of the file object.
        with tempfile.NamedTemporaryFile(suffix='.tar') as freeze_fp:
            # Reuse script_abspath (was redundantly recomputed via abspath).
            cmds = hadoopy._freeze.freeze_to_tar(script_abspath, freeze_fp.name)
            md5 = _md5_file(freeze_fp.name)
            frozen_tar_path = temp_path + '/%s.tar' % md5
            if not hadoopy.exists(frozen_tar_path):
                if not hadoopy.exists(temp_path):  # CDH4 Fix
                    hadoopy.mkdir(temp_path)
                hadoopy.put(freeze_fp.name, tmp_frozen_tar_path)
                try:
                    hadoopy.mv(tmp_frozen_tar_path, frozen_tar_path)
                except IOError:
                    if not hadoopy.exists(frozen_tar_path):  # Check again: another writer may have won the race
                        raise
    FREEZE_CACHE[script_abspath] = cmds, frozen_tar_path
    return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
Example #6
0
def doSample(jarfile, inputs, output, k):
    """Sample each input dataset into place on HDFS, then run the jar job.

    For every path in inputs that does not already exist on HDFS, the
    directory is created and populated with samples drawn from the matching
    directory under data_dir (a module-level global -- TODO confirm).  When
    k > 0 a fixed-size sample is taken via poolSample; otherwise commonSample
    is used with the module-level ratio (defined elsewhere -- verify).
    Finally the jar is executed over inputs/output and the job parameters
    reported by job.getJobPara() are returned.
    """
    for item in inputs:
        # Derive the dataset name from the last path component, tolerating a
        # trailing slash.
        if item[-1] == "/":
            name = (item[:-1]).split('/')[-1]
        else:
            name = item.split('/')[-1]
        print "item", item 
        #tmp_dir = tmp_path + name + "/"
        # Skip datasets that were already sampled by a previous run.
        if hadoopy.exists(item):
            continue
        hadoopy.mkdir(item)
        #tmp_inputs.append(tmp_dir)
        # Source directory holding the full (unsampled) data for this dataset.
        real_input = data_dir + name + "/"
        for f in hadoopy.ls(real_input):
            if not hadoopy.isdir(f):
                #ff = tmp_dir + f.split('/')[-1]
                # k > 0 selects fixed-size sampling; otherwise ratio-based.
                if k > 0:
                    poolSample(f, item, k)
                else:
                    commonSample(f, item, ratio)
    '''if not hadoopy.exists(output):
        hadoopy.mkdir(output)
    if hadoopy.isdir(output):
        output = output[:-1]
    if output[-1] == '/':
        output = output[:-1]
    name = output.split('/')[-1]
    tmp_output = tmp_path + name + "/"'''
    #if not hpath.exists(tmp_output):
    #    hdfs.mkdir(tmp_output)
    # Run the actual MapReduce job over the sampled inputs.
    codegen.executeJar(jarfile, inputs, output)
    #jobid = job.getJobIDFromLog(tmp_log_dir)
    job_para = job.getJobPara()
    '''for item in tmp_inputs:
        os.system("hadoop fs -rmr " + item)
    os.system("hadoop fs -rmr " + tmp_output)'''
    return job_para
Example #7
0
# NOTE(review): this chunk begins inside a try block opened above this view.
# arg2 is the optional record count from the command line (default 1000).
	arg2 = int(sys.argv[2])
except Exception:
	arg2 = 1000


# arg3 is the optional HDFS destination directory (default "/logs").
try:
	arg3 = sys.argv[3]
except Exception:
	arg3 = "/logs"

hdfs_path = arg3


# Ensure the HDFS target directory exists before any writes.
if not hadoopy.exists(hdfs_path):
	print "does not exist, hence creating directory in hdfs"
	hadoopy.mkdir(hdfs_path)
else:
	print "writing to hdfs"

# Mirror the output locally under ./logs as well.
if not os.path.exists("./logs"):
    os.makedirs("./logs")

# Timestamped local output file, e.g. ./logs/data20140101123000.
ts = time.time()
ts = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d%H%M%S')
fw = open("./logs"+"/"+"data"+ts,"w")

dataList = []

# Build arg2 synthetic records: date, publisher, advertiser, website, geo,
# a rounded float, and an integer.  randomDate and the *_list globals are
# presumably defined above this view -- TODO confirm.
for i in xrange(arg2):
	string = randomDate("1/1/2010-1:30:00", "1/1/2014-4:50:60",random.random())+" "+pub_list[int(random.random()*10)%len(pub_list)]+" "+advertiser_list[int(random.random()*10)%len(advertiser_list)]+" "+ website_list[int(random.random()*10)%len(website_list)] + " " + geo_list[int(random.random()*10)%len(geo_list)] + " " +str(round(random.random(),4)) + " " + str(int(random.random()*10000))
	# Batch boundary every 1000 records (loop body continues past this view).
	if (i+1)%1000 == 0 :
Example #8
0
 def setUp(self):
     """Create the per-test HDFS data directory before each test."""
     try:
         hadoopy.mkdir(self.data_path)
     except IOError:
         pass  # directory already exists on HDFS -- nothing to do
Example #9
0
 def setUp(self):
     """Ensure the test's HDFS data directory exists, tolerating reruns."""
     try:
         hadoopy.mkdir(self.data_path)
     except IOError:
         # A pre-existing directory is acceptable; ignore the error.
         pass