def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    self.assertEquals(len(hadoopy.ls(in_path)), 1)
    self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    launcher(in_path, out_path, 'wc.py',
             jobconfs=['mapred.min.split.size=100000000',
                       'mapreduce.task.userlog.limit.kb=1000'])
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    else:
        raise ValueError('Launcher not recognized')
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def freeze_script(script_path, temp_path='_hadoopy_temp'):
    """Freezes a script, puts it on hdfs, and gives you the path

    'frozen_tar_path' can be given to launch_frozen and it will use that
    instead of making its own, this is useful for repeated calls.  If a
    file with the same md5 already exists in the temp_path, it is used
    instead of putting a new copy there to avoid the file transfer.  The
    files are put into a temporary file based on the timestamp first, then
    moved to a location that is only a function of their md5 to prevent
    partial files.

    Args:
        script_path: Path to a hadoopy script
        temp_path: HDFS temporary path (default is '_hadoopy_temp')

    Returns:
        {'cmds': commands_ran, 'frozen_tar_path': frozen_tar_path}
    """
    tmp_frozen_tar_path = temp_path + '/%f.tar' % time.time()
    freeze_fp = tempfile.NamedTemporaryFile(suffix='.tar')
    cmds = hadoopy._freeze.freeze_to_tar(os.path.abspath(script_path), freeze_fp.name)
    md5 = _md5_file(freeze_fp.name)
    frozen_tar_path = temp_path + '/%s.tar' % md5
    if hadoopy.exists(frozen_tar_path):
        return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
    hadoopy.put(freeze_fp.name, tmp_frozen_tar_path)
    try:
        hadoopy.mv(tmp_frozen_tar_path, frozen_tar_path)
    except IOError, e:
        if hadoopy.exists(frozen_tar_path):  # Check again
            return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
        raise e
    return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
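# Hedged usage sketch (not part of the sources above): the docstring says the returned
# 'frozen_tar_path' can be handed back to launch_frozen on repeated calls so the script
# is only frozen and uploaded once.  The script name and paths below are illustrative
# assumptions, not values from the original projects.
frozen = freeze_script('wc.py')
for day in ('2015-01-01', '2015-01-02'):
    hadoopy.launch_frozen('data/%s' % day, 'out/%s' % day, 'wc.py',
                          frozen_tar_path=frozen['frozen_tar_path'])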
def tearDown(self):
    if hadoopy.exists(self.data_path):
        self.assertTrue(hadoopy.isempty(self.data_path))  # directories are empty
        self.assertTrue(hadoopy.isdir(self.data_path))
        hadoopy.rmr(self.data_path)
    self.assertFalse(hadoopy.exists(self.data_path))
    self.assertFalse(hadoopy.isdir(self.data_path))
    self.assertFalse(hadoopy.isempty(self.data_path))
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-qrr%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    jobconfs = []

    # determine the split size
    if 'split_size' in args:
        splitsize = args['split_size']
        jobconfs.append('mapreduce.input.fileinputformat.split.minsize=' + str(splitsize))

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput
            mapper = 'org.apache.hadoop.mapred.lib.IdentityMapper'
        else:
            mapper = True  # use the command line mapper

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            hadoopy.launch_frozen(input, curoutput, __file__,
                                  mapper=mapper,
                                  cmdenvs=gopts.cmdenv(),
                                  num_reducers=int(step),
                                  jobconfs=jobconfs)
def _run_wc(self, orig_fn, script_name='wc.py', launcher=hadoopy.launch_frozen, **kw):
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    self.assertEquals(len(hadoopy.ls(in_path)), 1)
    #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])  # This is no longer true in CDH4
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        launcher(in_path, out_path + '_list_jobconfs', script_name,
                 jobconfs=['mapred.min.split.size=100000000',
                           'mapreduce.task.userlog.limit.kb=1000'], **kw)
        launcher(in_path, out_path, script_name,
                 jobconfs={'mapred.min.split.size': '100000000',
                           'mapreduce.task.userlog.limit.kb': '1000'}, **kw)
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == 'launch_frozen_cmd':
        cmd = ('python %s launch_frozen %s %s '
               '-jobconf "mapred.min.split.size=100000000" '
               '-jobconf "mapreduce.task.userlog.limit.kb=1000"') % (script_name, in_path, out_path)
        print(cmd)
        subprocess.call(cmd.split())
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError('Launcher not recognized')
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-normal%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            if i > 0:
                mapper = "org.apache.hadoop.mapred.lib.IdentityMapper"
                hadoopy.launch_frozen(input, curoutput, __file__,
                                      mapper=mapper,
                                      cmdenvs=gopts.cmdenv(),
                                      num_reducers=int(step))
            else:
                hadoopy.launch_frozen(input, curoutput, __file__,
                                      cmdenvs=gopts.cmdenv(),
                                      num_reducers=int(step))
def test_err(self):
    nonsense_path = 'sdfskjdfksjdkfjskdfksjdfksdkfjskdjfksjdk'
    self.assertFalse(hadoopy.exists(nonsense_path))
    self.assertEquals(hadoopy.abspath(nonsense_path).rsplit('/')[-1], nonsense_path)
    self.assertRaises(IOError, hadoopy.ls, nonsense_path)
    self.assertRaises(IOError, hadoopy.readtb(nonsense_path).next)
def main():
    if hadoopy.exists(hdfs_output):
        hadoopy.rmr("-skipTrash %s" % hdfs_output)
    hadoopy.launch(hdfs_path, hdfs_output, 'WordCount.py', files=['../stop_words.txt'])
def hdfs_temp(hdfs_temp_dir=None):
    if hdfs_temp_dir is None:
        hdfs_temp_dir = HDFS_TEMP_DIR
    temp_path = hadoopy.abspath('%s/%f-%f' % (hdfs_temp_dir, time.time(), random.random()))
    yield temp_path
    if hadoopy.exists(temp_path):
        hadoopy.rmr(temp_path)
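# Hedged usage sketch (not from the sources above): hdfs_temp() yields a unique HDFS path
# and removes it after the yield, which suggests the original wraps it with
# contextlib.contextmanager -- that decorator is not shown here, so it is an assumption.
with hdfs_temp() as temp_path:
    hadoopy.writetb(temp_path + '/data.tb', [('key', 'value')])
    print(list(hadoopy.readtb(temp_path + '/data.tb')))
# the temporary path is deleted on exit from the with-block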
def insert_vector_into_hdfs(hdfs_path, iterator):
    # Delete the file if it already exists
    if hadoopy.exists(hdfs_path):
        hadoopy.rmr("-skipTrash %s" % hdfs_path)
    # Writing to HDFS
    # If the name node is in safe mode ("Cannot create file/user/edge_list.tb. Name node
    # is in safe mode."), run: user$ hadoop dfsadmin -safemode leave
    hadoopy.writetb(hdfs_path, iterator)
def _inner(in_path, out_path, script_path, *args, **kw):
    out_path = canonicalize_path(out_path)
    _new_output(out_path)
    if isinstance(in_path, str):
        in_path = canonicalize_path(in_path)
    else:
        in_path = [canonicalize_path(x) for x in in_path]
    gevent.sleep()
    if isinstance(in_path, str):
        _wait_on_input(in_path)
    else:
        for x in in_path:
            _wait_on_input(x)
    print('Flow: All inputs available [%s]' % str(in_path))
    update_graph(in_path, out_path, script_path)
    if USE_EXISTING and hadoopy.exists(out_path):
        print(("Flow: Reusing output [%s]. 1.) You can't use the return value"
               " of this command (it is set to None) and 2.) The existing output is assumed to be correct.") % out_path)
        p = None
    else:
        p = launch(in_path, out_path, script_path, wait=False, *args, **kw)
        while p['process'].poll() is None:
            gevent.sleep(.1)
        print('Flow: Process completed')
        if p['process'].returncode:
            for x in range(10):
                print('Flow: Task failed....[%d/10]' % x)
            raise subprocess.CalledProcessError(p['process'].returncode, p['hadoop_cmds'][0])
    _set_output(out_path)
    return p
def write_tb(path, fold=None):
    fddb_path = '/home/morariu/downloads/fddb'
    if fold is None:
        folds_glob = fddb_path + '/FDDB-folds/FDDB-fold-??.txt'
    else:
        folds_glob = fddb_path + '/FDDB-folds/FDDB-fold-%02i.txt' % fold

    if hadoopy.exists(path):
        # do nothing if the file already exists
        pass
    else:
        # otherwise, find all images in the fddb folds and put them on hdfs
        names = []
        for fn in glob.glob(folds_glob):
            with open(fn, 'r') as fp:
                names.extend(['%s/%s.jpg' % (fddb_path, l)
                              for l in fp.read().strip().split('\n')])

        # print message about filenames that do not exist
        for n in names:
            if not os.path.exists(n):
                print('"%s" does not exist!' % n)

        # remove those filenames from the list
        names = filter(os.path.exists, names)

        # write the images to tb files
        hadoopy.writetb(path, [(n, open(n, 'rb').read()) for n in names])
def insert_data_into_hdfs():
    # Delete the file if it already exists
    if hadoopy.exists(tb_path):
        hadoopy.rmr("-skipTrash %s" % tb_path)
    # Writing to HDFS
    # If the name node is in safe mode ("Cannot create file/user/edge_list.tb. Name node
    # is in safe mode."), run: user$ hadoop dfsadmin -safemode leave
    hadoopy.writetb(tb_path, get_kv_from_file(data_file_path))
def freeze_script(script_path, cache=True, temp_path='_hadoopy_temp'):
    """Freezes a script, puts it on hdfs, and gives you the path

    'frozen_tar_path' can be given to launch_frozen and it will use that
    instead of making its own, this is useful for repeated calls.  If a
    file with the same md5 already exists in the temp_path, it is used
    instead of putting a new copy there to avoid the file transfer.  The
    files are put into a temporary file based on the timestamp first, then
    moved to a location that is only a function of their md5 to prevent
    partial files.

    Args:
        script_path: Path to a hadoopy script
        cache: If True (default) then use previously frozen scripts.  Cache is stored
            in memory (not persistent).
        temp_path: HDFS temporary path (default is '_hadoopy_temp')

    Returns:
        {'cmds': commands_ran, 'frozen_tar_path': frozen_tar_path}

    Raises:
        ValueError: Script cannot be found
    """
    script_abspath = os.path.abspath(script_path)
    if not os.path.exists(script_abspath):
        raise ValueError('Script [%s] does not exist.' % script_abspath)
    try:
        if not cache:
            raise KeyError  # NOTE(brandyn): Don't use cache item
        cmds, frozen_tar_path = FREEZE_CACHE[script_abspath]
    except KeyError:
        tmp_frozen_tar_path = temp_path + '/%f.tar' % time.time()
        freeze_fp = tempfile.NamedTemporaryFile(suffix='.tar')
        cmds = hadoopy._freeze.freeze_to_tar(os.path.abspath(script_path), freeze_fp.name)
        md5 = _md5_file(freeze_fp.name)
        frozen_tar_path = temp_path + '/%s.tar' % md5
        if not hadoopy.exists(frozen_tar_path):
            if not hadoopy.exists(temp_path):  # CDH4 Fix
                hadoopy.mkdir(temp_path)
            hadoopy.put(freeze_fp.name, tmp_frozen_tar_path)
            try:
                hadoopy.mv(tmp_frozen_tar_path, frozen_tar_path)
            except IOError:
                if not hadoopy.exists(frozen_tar_path):  # Check again
                    raise
    FREEZE_CACHE[script_abspath] = cmds, frozen_tar_path
    return {'cmds': cmds, 'frozen_tar_path': frozen_tar_path}
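# Hedged sketch (not from the sources above): in the variant above, results are also
# memoized in-process in FREEZE_CACHE keyed on the script's absolute path, and
# cache=False skips that lookup and re-freezes.  The script name 'wc.py' is an
# illustrative assumption.
first = freeze_script('wc.py')               # freezes the script, uploads the tar, caches
second = freeze_script('wc.py')              # returned from FREEZE_CACHE, no HDFS traffic
fresh = freeze_script('wc.py', cache=False)  # rebuilds the tar and re-checks HDFS by md5
assert first['frozen_tar_path'] == second['frozen_tar_path']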
def _run_face(self, fn, **kw):
    in_path = self.data_path + fn
    out_path = "%sout-%s-%f" % (self.data_path, fn, time.time())
    if not hadoopy.exists(in_path):
        hadoopy.put(fn, in_path)
    hadoopy.launch_frozen(in_path, out_path, "face_finder.py",
                          files=["haarcascade_frontalface_default.xml"], **kw)
    for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
        with open(self.out_path + "img%.8d.jpg" % num, "w") as fp:
            fp.write(image_data)
def _run_face(self, fn, out_path, **kw):
    in_path = self.data_path + fn
    hdfs_out_path = '%sout-%s-%f' % (self.data_path, fn, time.time())
    if not hadoopy.exists(in_path):
        hadoopy.put(fn, in_path)
    hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py',
                          files=['haarcascade_frontalface_default.xml'], **kw)
    for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
        with open(out_path + 'img%.8d.png' % num, 'w') as fp:
            fp.write(image_data)
def _wait_on_input(in_path):
    import hadoopy
    if not hadoopy.exists(in_path) and in_path not in HADOOPY_OUTPUTS:
        #print('Flow: Path [%s] does not exist yet, we will wait for it but you must create it eventually.' % in_path)
        print('Flow: Path [%s] does not exist yet, you will probably get an error from hadoop.' % in_path)
    if in_path in HADOOPY_OUTPUTS:  # not hadoopy.exists(in_path)
        print('Flow: Waiting for [%s]' % in_path)
        HADOOPY_OUTPUTS.setdefault(in_path, gevent.event.Event()).wait()
        print('Flow: Obtained [%s]' % in_path)
def test_name(self):
    if not hadoopy.exists('picarus/logos'):
        put_logos_on_hadoop()
    lp = LogoProcessor()
    hdfs_path = 'picarus/logos'
    #lp.compute_db_hadoop(hdfs_path)
    with open('index.pb') as fp:
        lp.load(fp.read())
    print lp.index._hashes.shape
    compare_to_local(lp)
def _run_wc(self, orig_fn, script_name="wc.py", launcher=hadoopy.launch_frozen, **kw):
    fn = "out-%f-%s" % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + ".out"
    print(os.path.abspath("."))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    self.assertEquals(len(hadoopy.ls(in_path)), 1)
    self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        launcher(in_path, out_path, script_name,
                 jobconfs=["mapred.min.split.size=100000000",
                           "mapreduce.task.userlog.limit.kb=1000"], **kw)
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == "launch_frozen_cmd":
        cmd = (
            'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"'
            % (script_name, in_path, out_path)
        )
        print(cmd)
        subprocess.call(cmd.split())
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError("Launcher not recognized")
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc["the"], 1664)
    self.assertEqual(wc["Alice"], 221)
def _inner(out_path, *args, **kw):
    out_path = canonicalize_path(out_path)
    _new_output(out_path)
    print('Flow: Writer called on [%s]' % out_path)
    gevent.sleep()
    if USE_EXISTING and hadoopy.exists(out_path):
        print(("Flow: Reusing output [%s]. 1.) You can't use the return value"
               " of this command (it is set to None) and 2.) The existing output is assumed to be correct.") % out_path)
        out = None
    else:
        out = hdfs(out_path, *args, **kw)
    _set_output(out_path)
    EDGES.append('%s->%s' % (get_local_node(), get_path_node(out_path)))
    return out
def aggregateData(num_line, terminus1, terminus2, threshold, *dirs):
    print 'aller:{}-{} \nretour:{}-{}'.format(terminus1, terminus2, terminus2, terminus1)
    ter_coor1, ter_coor2 = data_extraction.getTerminusCoor(num_line, terminus1, terminus2)
    source_gps_dir = 'hdfs://BigDataPOC:8020/datalab/exp_b02/data/gps_data'
    data_aller = []
    data_retour = []
    for dir_name in dirs:
        print dir_name
        path = os.path.join(source_gps_dir, str(num_line), str(dir_name))
        if hadoopy.isdir(path):
            cmd = 'hdfs dfs -du %s' % (path)
            p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            frames_aller = []
            frames_retour = []
            for file_name in p.stdout.readlines():
                line_daily = extractInfo(file_name.split()[1])
                line_daily_aller, line_daily_retour = data_extraction.generateDailyData(
                    line_daily, (ter_coor1, ter_coor2), threshold, terminus1, terminus2)
                frames_aller.append(line_daily_aller)
                frames_retour.append(line_daily_retour)
            data_aller.append(pd.concat(frames_aller).reset_index().drop('index', axis=1))
            data_retour.append(pd.concat(frames_retour).reset_index().drop('index', axis=1))
        else:
            if hadoopy.exists(dir_name):
                line_daily = extractInfo(dir_name)
                line_daily_aller, line_daily_retour = data_extraction.generateDailyData(
                    line_daily, (ter_coor1, ter_coor2), threshold, terminus1, terminus2)
                data_aller.append(line_daily_aller)
                data_retour.append(line_daily_retour)
            else:
                print "there are paths in args which are not directories"
                sys.exit(1)
    data_aller = pd.concat(data_aller).reset_index().drop('index', axis=1)
    data_retour = pd.concat(data_retour).reset_index().drop('index', axis=1)
    cols = ['DATE', 'TIME', 'LINE', 'BUS_NUM', 'X_COORDINATE', 'Y_COORDINATE',
            'LONGITUDE', 'LATITUDE', 'SPEED']
    data_aller = data_aller[cols]
    data_retour = data_retour[cols]
    return data_aller, data_retour
def _run_face(self, fn, out_path, **kw):
    bfn = os.path.basename(fn)
    in_path = self.data_path + bfn
    hdfs_out_path = '%sout-%s-%f' % (self.data_path, bfn, time.time())
    if not hadoopy.exists(in_path):
        hadoopy.put(fn, in_path)
    hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py',
                          files=['haarcascade_frontalface_default.xml'], **kw)
    for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
        with open(out_path + 'img%.8d.png' % num, 'w') as fp:
            fp.write(image_data)
def test_readtb_writetb(self):
    working_path = '%s/readtb_writetb/' % (self.data_path)
    self.assertFalse(hadoopy.exists(working_path))
    self.assertFalse(hadoopy.isdir(working_path))
    self.assertFalse(hadoopy.isempty(working_path))
    for x in range(10):
        fn = '%s/%.5d' % (working_path, x)
        print(fn)
        data = [('1', 1), (1.3, np.array([1, 2, 3])), (True, {'1': 3})]
        hadoopy.writetb(fn, data)
        self.assertFalse(hadoopy.isdir(fn))
        self.assertFalse(hadoopy.isempty(fn))
    self.assertTrue(hadoopy.isdir(working_path))
    self.assertTrue(hadoopy.isempty(working_path))  # isempty returns true on directories
    self.assertEqual(self._readtb(readtb, working_path),
                     self._readtb(hadoopy.readtb, working_path))
def extractUsefulData(num_line, start_date, end_date):
    year = str(start_date)[:4]
    month = str(start_date)[4:6]
    start_day = str(start_date)[-2:]
    end_day = str(end_date)[-2:]
    home_dir_source = 'hdfs://BigDataPOC:8020/datalab/exp_vsb/inputData'
    home_dir_des = 'hdfs://BigDataPOC:8020/datalab/exp_b02/data/gps_data'
    for i in np.arange(int(start_day), int(end_day) + 1):
        if i < 10:
            date = '0' + str(i)
        else:
            date = str(i)
        file_source = 'loc_bus_' + str(start_date)[:6] + date + '_' + str(num_line) + '.csv'
        source = os.path.join(home_dir_source, file_source)
        home_dir_des_line = os.path.join(home_dir_des, str(num_line))
        home_dir_des_month = os.path.join(home_dir_des_line, str(start_date)[:6])
        if not os.path.exists(home_dir_des_line):
            try:
                os.mkdir(os.path.dirname(home_dir_des_line))
            except OSError:
                pass
        if not os.path.exists(home_dir_des_month):
            try:
                os.mkdir(os.path.dirname(home_dir_des_month))
            except OSError:
                pass
        file_des = 'bus_gps_' + str(start_date)[:6] + date + '_' + str(num_line) + '.csv'
        destination = os.path.join(home_dir_des_month, file_des)
        if hadoopy.exists(destination):
            hadoopy.rmr(destination)
        getGpsData(source, destination)
        print 'it is finished:' + file_des
def flickr_images(tags, images_per_tag, hdfs_output, num_files=20, max_iters=1, max_pages=1,
                  output_meta=False, api_key=None, api_secret=None, remove_output=False):
    tags = list(tags)
    if api_key is None or api_secret is None:
        api_key = os.environ['FLICKR_API_KEY']
        api_secret = os.environ['FLICKR_API_SECRET']
    tags_per_chunk = max(len(tags) / num_files, 1)
    if remove_output and hadoopy.exists(hdfs_output):
        print('Removing output dir[%s]' % hdfs_output)
        hadoopy.rmr(hdfs_output)
    cmdenvs = {'FLICKR_API_KEY': api_key,
               'FLICKR_API_SECRET': api_secret,
               'MAX_ITERS': str(max_iters),
               'MAX_PAGES': str(max_pages)}
    for chunk_num, chunk_tags in enumerate(_chunks(tags, tags_per_chunk)):
        hadoopy.writetb(hdfs_output + '/tags/%d' % chunk_num,
                        [(images_per_tag, tag) for tag in chunk_tags])
    hadoopy.launch_frozen(hdfs_output + '/tags', hdfs_output + '/metadata', _lf('flickr_bulk.py'),
                          cmdenvs=cmdenvs, num_reducers=num_files)
    output_type = 'meta' if output_meta else 'image'
    hadoopy.launch_frozen(hdfs_output + '/metadata', hdfs_output + '/image_metadata',
                          _lf('file_downloader.py'), cmdenvs={'OUTPUT_TYPE': output_type})
def doSample(jarfile, inputs, output, k):
    for item in inputs:
        if item[-1] == "/":
            name = (item[:-1]).split('/')[-1]
        else:
            name = item.split('/')[-1]
        print "item", item
        #tmp_dir = tmp_path + name + "/"
        if hadoopy.exists(item):
            continue
        hadoopy.mkdir(item)
        #tmp_inputs.append(tmp_dir)
        real_input = data_dir + name + "/"
        for f in hadoopy.ls(real_input):
            if not hadoopy.isdir(f):
                #ff = tmp_dir + f.split('/')[-1]
                if k > 0:
                    poolSample(f, item, k)
                else:
                    commonSample(f, item, ratio)
    '''if not hadoopy.exists(output):
        hadoopy.mkdir(output)
    if hadoopy.isdir(output):
        output = output[:-1]
    if output[-1] == '/':
        output = output[:-1]
    name = output.split('/')[-1]
    tmp_output = tmp_path + name + "/"'''
    #if not hpath.exists(tmp_output):
    #    hdfs.mkdir(tmp_output)
    codegen.executeJar(jarfile, inputs, output)
    #jobid = job.getJobIDFromLog(tmp_log_dir)
    job_para = job.getJobPara()
    '''for item in tmp_inputs:
        os.system("hadoop fs -rmr " + item)
    os.system("hadoop fs -rmr " + tmp_output)'''
    return job_para
cur_args.extend(['-mapper', "'./tsqr map %i'" % (blocksize)])

if i + 1 == len(steps):
    curoutput = output
else:
    curoutput = output + "_iter%i" % (i + 1)

cur_args.extend(['-jobconf', "'mapreduce.job.name=" + jobname + " (%i/%i)'" % (i + 1, len(steps))])
cur_args.extend(['-input', "'" + input + "'"])
cur_args.extend(['-output', "'" + curoutput + "'"])
cur_args.extend(['-numReduceTasks', "'%i'" % (int(step))])

cmd = ['hadoop', 'jar', streaming_jar]
cmd.extend(cur_args)

print "Running Hadoop Command:"
print
print ' '.join(cmd)
print
print "End Hadoop Command"

if hadoopy.exists(curoutput):
    print "Removing %s" % (curoutput)
    hadoopy.rm(curoutput)

subprocess.check_call(' '.join(cmd), shell=True)
# -*- coding: utf-8 -*-
"""
Created on Mon Nov  9 16:35:12 2015

@author: user
"""
import hadoopy

input_path = 'wiki_index.tb'
output_path = "/result"

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)

hadoopy.launch(input_path, output_path, 'map_red_01.py')

word_urls = dict(hadoopy.readtb(output_path))

for word in word_urls:
    print "%s: %s, %s" % (word, word_urls[word][0], word_urls[word][1])
#!/usr/bin/env python
import hadoopy

input_path = "/alice.txt"
output_path = "/result"

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)

hadoopy.launch(input_path, output_path, 'WordCount.py')

word_counts = dict(hadoopy.readtb(output_path))
for word in word_counts:
    print "%s: %d" % (word, word_counts[word])
hiveStatementForPythonCreate += " or ".join(tempStatement)
hiveStatementForPythonCreate += ");"
print "hiveStatementForPythonCreate:" + hiveStatementForPythonCreate
hivestrcommandForPython = ["hive", "-e", hiveStatementForPythonCreate]
current2 = datetime.datetime.now()
call(hivestrcommandForPython)
current3 = datetime.datetime.now()
print "hive2 second=" + str((current3 - current2).seconds)

#impalaStatementForCreate = "use tax;refresh tax.tax_access_log_python;insert overwrite TABLE tax_access_log_partition PARTITION (date_hour) SELECT client_ip,client,userid,request,method,uri,protocal,path,params,query,fileType,fileName,status,bytes_sent, date_time,referer,useragent,host,concat(strleft(from_unixtime(unix_timestamp(date_time)),14),'00:00')as date_hour from tax.tax_access_log_python;";

##### 3. delete old data
for deltime in deleteTime:
    hdfsFilePath = '"/user/hive/warehouse/tax.db/tax_access_log_partition/date_hour=' + deltime + '"'
    if hadoopy.exists(hdfsFilePath) == 1:
        print "remove file path:" + hdfsFilePath
        hadoopy.rmr('"/user/hive/warehouse/tax.db/tax_access_log_partition/date_hour=' + deltime + '"')

##### 4. insert Impala
impalaStatementForCreate = "use tax;refresh tax.tax_access_log_python;"
impalaStatementForCreate += " insert into TABLE tax_access_log_partition PARTITION (date_hour) "
impalaStatementForCreate += " SELECT client_ip,client,userid,request,method,uri,protocal,path,params,query,fileType,fileName,status,bytes_sent, date_time,referer,useragent,host,concat(strleft(from_unixtime(unix_timestamp(date_time)),14),'00:00')as date_hour "
impalaStatementForCreate += " from tax.tax_access_log_python"
impalaStatementForCreate += " where "
tempStatement = []
for insert_time in insertTime:
    tempStatement += ["date_time like '" + insert_time + "'"]
impalaStatementForCreate += " or ".join(tempStatement)
def _load_input_data(self):
    # copy fddb data to hdfs if it is not already there
    if not hadoopy.exists(self.data_fn):
        print('Creating input data \'%s\'...' % self.data_fn)
        import fddb_data
        fddb_data.write_tb(self.data_fn, 1)
def _load_input_data(data_fn):
    """copy fddb data to hdfs if it is not already there"""
    if not hadoopy.exists(data_fn):
        print('Creating input data \'%s\'...' % data_fn)
        import fddb_data
        fddb_data.write_tb(data_fn)
try:
    arg2 = int(sys.argv[2])
except Exception:
    arg2 = 1000

try:
    arg3 = sys.argv[3]
except Exception:
    arg3 = "/logs"

hdfs_path = arg3
if not hadoopy.exists(hdfs_path):
    print "does not exist, hence creating directory in hdfs"
    hadoopy.mkdir(hdfs_path)
else:
    print "writing to hdfs"

if not os.path.exists("./logs"):
    os.makedirs("./logs")

ts = time.time()
ts = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d%H%M%S')
fw = open("./logs" + "/" + "data" + ts, "w")
dataList = []
for i in xrange(arg2):
output_path = "hdfs://localhost:9000/user/user/vector"
temp_path = "hdfs://localhost:9000/user/user/temp"


def read_vector(vect):
    for i, v in enumerate(vect):
        yield str(i).encode('utf-8'), v


N = 64375
diff = 1.
r0 = np.ones(N).astype(np.float) / N

if hadoopy.exists(input_path):
    hadoopy.rmr("-skipTrash %s" % input_path)
os.system('hdfs dfs -cp ' + edge_path + ' ' + input_path)

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)
hadoopy.writetb(output_path, read_vector(r0))

if hadoopy.exists(temp_path):
    hadoopy.rmr("-skipTrash %s" % temp_path)

iteration = 0
while diff > 0.01:
    if hadoopy.exists(temp_path):
        hadoopy.rmr("-skipTrash %s" % temp_path)
    hadoopy.launch(input_path, temp_path, 'PageRank.py', files=[])
import hadoopy
import os
import sys
import happybase
import numpy as np

hdfs_path = 'simplewikiFromHbase'
local_path = 'simlewikiFromHbaseLocal'

if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s" % hdfs_path)

connection = happybase.Connection('localhost', '9090')
if 'simplewiki' not in connection.tables():
    sys.exit("Error : no simplewiki table found")
else:
    print "OK : simplewiki table found"

table_wiki = connection.table('simplewiki')

NdocsMax = 30000


def read_hbase(table_hbase):
    for key, data in table_hbase.scan(limit=NdocsMax):
        yield key.decode('utf-8'), data['wiki:text'].decode('utf-8')

#def read_local_dir(local_path):
#    for fn in os.listdir(local_path):
#        path = os.path.join(local_path, fn)
#        if os.path.isfile(path):
from google_ngram_downloader import readline_google_store
import wget
import hadoopy
import os

for i in range(3, 6):
    gene = readline_google_store(ngram_len=i, lang='eng')
    while True:
        try:
            fname, url, records = next(gene)
            print fname
            if hadoopy.exists('/google-ngram/' + str(i) + '/' + fname):
                continue
            else:
                wget.download(url)
                hadoopy.put(fname, '/google-ngram/' + str(i) + '/' + fname)
                os.remove(fname)
        except StopIteration:
            print "END"
            break
sys.setdefaultencoding('utf8')

hbase_table = 'wiki'
hdfs_path = 'wiki.tb'
host = 'localhost'

connection = happybase.Connection(host)
wiki_table = connection.table(hbase_table)


def get_url_content_for_hdfs():
    for url, content in wiki_table.scan():
        v = content['cf:content'].encode('utf-8')
        yield url, v

if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s" % (hdfs_path))  # Delete the existing file first (cleanup)

hadoopy.writetb(hdfs_path, get_url_content_for_hdfs())  # Write the wiki table into HDFS

# Test OK (ATIH 2/12/2015)
url_content_dict = dict(hadoopy.readtb(hdfs_path))
for k, v in url_content_dict.iteritems():
    print 'k = ', k
    print 'v = ', v
    break

for k, v in hadoopy.readtb(hdfs_path):
    print 'k = ', k.encode('utf-8')
    print 'v = ', v.encode('utf-8')
    break
def validate(argv):
    keywords_selected = False
    keywords = []
    verbose = False
    hdfs_path = None
    local_path = None
    roll_size_selected = False
    roll_size = None
    consumer_key = None
    consumer_secret = None
    access_token_key = None
    access_token_secret = None
    credential_file_selected = False
    keywords_credential_file_selected = False
    credential_file_path = ''
    since_tweet_id = None
    https_proxy = None

    try:
        # second arg is (short) options, should be separated by :
        # third arg is long options, as an array
        opts, args = getopt.getopt(argv, "", ["keywords=", "verbose", "write-to-hdfs=",
                                              "write-to-local=", "roll-size=", "consumer-key=",
                                              "consumer-secret=", "access-token-key=",
                                              "access-token-secret=", "credential-file=",
                                              "keywords-credential-file=", "since-tweet-id=",
                                              "https-proxy="])
    except getopt.GetoptError:
        print Option.print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '--help':
            Option.print_help()
        elif opt == '--keywords':
            if keywords_credential_file_selected:
                print "Error: You cannot choose --keywords and --keywords-credential-file at the same time"
                sys.exit(2)
            else:
                keywords_selected = True
                keywords = arg.split(",")
        elif opt == '--verbose':
            verbose = True
        elif opt == '--write-to-hdfs':
            # validate and parse hdfs path
            hdfs_path = Option.parse_hdfs_path(arg)
            # if not hdfs_path or not hadoopy.exists(hdfs_path):
            if not hdfs_path:
                print "Error: URL should be valid. Ex. hdfs://<host>:<port>/hdfs/dir"
                sys.exit(2)
            elif not hadoopy.exists(hdfs_path):
                print "Error: HDFS path does not exist"
                sys.exit(2)
            elif not hdfs_path.endswith("/"):
                hdfs_path = hdfs_path + "/"
        elif opt == '--write-to-local':
            # validate local path
            if not path.isdir(arg):
                print "Error: Local path is not a directory or does not exist."
                sys.exit(2)
            else:
                local_path = arg if arg.endswith('/') else arg + '/'
        elif opt == '--roll-size':
            right_format, total_size, message = Option.parse_roll_size(arg, Util.MIN_ROLL_SIZE, Util.MAX_ROLL_SIZE)
            if right_format:
                roll_size_selected = True
                roll_size = total_size
            else:
                print message
                sys.exit(2)
        elif opt == '--credential-file':
            if keywords_credential_file_selected:
                print "Error: You cannot choose --credential-file and --keywords-credential-file at the same time"
                sys.exit(2)
            else:
                credential_file_selected = True
                credential_file_path = arg
        elif opt == '--keywords-credential-file':
            if keywords_selected or credential_file_selected:
                print "Error: You cannot choose --keywords-credential-file with --keywords and/or --credential-file"
                sys.exit(2)
            else:
                keywords_credential_file_selected = True
                credential_file_path = arg
        elif opt == '--consumer-key':
            consumer_key = arg
        elif opt == '--consumer-secret':
            consumer_secret = arg
        elif opt == '--access-token-key':
            access_token_key = arg
        elif opt == '--access-token-secret':
            access_token_secret = arg
        elif opt == '--since-tweet-id':
            if len(str(arg)) < 18:
                print "Warning: Invalid tweet id; ignoring set value."
            else:
                since_tweet_id = arg
        elif opt == '--https-proxy':
            if not Option.parse_https_proxy(arg):
                print "Warning: Possibly invalid HTTPS PROXY URL string; ignoring set value."
            else:
                https_proxy = arg

    if not keywords_selected and not keywords_credential_file_selected:
        print "Error: Keywords are required"
        sys.exit(2)

    if credential_file_selected:
        valid, error_message, consumer_key, consumer_secret, access_token_key, access_token_secret, temp_keywords = Option.validate_keywords_credential_file(credential_file_path, False)
        if not valid:
            print error_message
            sys.exit(2)

    if keywords_credential_file_selected:
        valid, error_message, consumer_key, consumer_secret, access_token_key, access_token_secret, keywords = Option.validate_keywords_credential_file(credential_file_path, True)
        if not valid:
            print error_message
            sys.exit(2)

    if not (consumer_key and consumer_secret and access_token_key and access_token_secret):
        print str(consumer_key) + ', ' + str(consumer_secret) + ', ' + str(access_token_key) + ', ' + str(access_token_secret)
        print "Error: Incomplete Twitter credentials."
        sys.exit(2)

    if not roll_size_selected:
        if hdfs_path or local_path:
            print "Info: --roll-size not specified. Will default to roll size = 1048576 bytes (1 MB)."
            roll_size_selected = True
            roll_size = Util.DEFAULT_ROLL_SIZE
    else:
        if not hdfs_path and not local_path:
            print "Warning: --roll-size flag ignored. No file to save to."
            roll_size = None

    print 'keywords: ' + ",".join(keywords)
    print 'verbose: ' + str(verbose)
    print 'hdfs_path: ' + str(hdfs_path)
    print 'local_path: ' + str(local_path)
    print 'roll_size_selected: ' + str(roll_size_selected)
    print 'roll_size: ' + str(roll_size)
    print 'consumer_key: ' + str(consumer_key)
    print 'consumer_secret: ' + str(consumer_secret)
    print 'access_token_key: ' + str(access_token_key)
    print 'access_token_secret: ' + str(access_token_secret)
    print 'since_tweet_id: ' + str(since_tweet_id)
    print 'https_proxy: ' + str(https_proxy)

    return Option(keywords, verbose, hdfs_path, local_path, roll_size, consumer_key,
                  consumer_secret, access_token_key, access_token_secret, since_tweet_id, https_proxy)
def calcul_delta(vectore_before, vector_after):
    before = {}
    after = {}
    s = 0
    for k, v in vectore_before:
        before[k] = v
    for k, v in vector_after:
        after[k] = v
    for k in before:
        # accumulate absolute differences via the dicts built above
        # (indexing the raw (key, value) iterables by key would fail)
        s += np.abs(before[k] - after[k])
    return s

##############################################################################

if hadoopy.exists(temp_vector_path):
    hadoopy.rmr("-skipTrash %s" % temp_vector_path)
copy(eigen_vector_tb_path, temp_vector_path)

while diff > 0.01:
    eigen_vector_before = load_eigen_vector(temp_vector_path)
    if hadoopy.exists(temp_vector_path):
        hadoopy.rmr("-skipTrash %s" % temp_vector_path)
    hadoopy.launch_local(data_tb_path, temp_vector_path, 'PageRank.py')
    eigen_vector_after = load_eigen_vector(temp_vector_path)
#input_path="hdfs://localhost:9000/alice.txt"
input_hdfs_path = "hdfs://localhost:9000/user/user/simplewikiFromHbase"
output_hdfs_path = 'hdfs://localhost:9000/user/user/indexwikiFromSpark'

words_stop = [line.rstrip('\n') for line in open('../stop_words.txt')]
words_stop.append('')

sc = SparkContext()

lines = sc.sequenceFile(input_hdfs_path).map(
    lambda (x, y): (x[5:].decode('utf-8'), y[5:].decode('utf-8')))

splitText = lines.map(lambda (url, text): (url, [
    stem(word.group().lower()) for word in re.finditer(r"\w+", text, re.UNICODE)
    if word.group().lower() not in words_stop]))

tf = splitText.map(lambda (url, splittedText): (url, {
    word: 1.0 * splittedText.count(word) / len(splittedText)
    for word in splittedText}))

tfWordAsKey = tf.flatMap(lambda (url, tf): [(word, [(url, tf[word])])
                                            for word in tf]).reduceByKey(lambda a, b: a + b)

tfidf = tfWordAsKey.map(lambda (word, tfList): (word, [(url, tf * np.log10(27474.0 / len(tfList)))
                                                       for (url, tf) in tfList]))

NwordsMax = 200000


def read_rdd(rdd):
    for key, data in rdd.takeSample(True, NwordsMax):
        yield key, data

if hadoopy.exists(output_hdfs_path):
    hadoopy.rmr("-skipTrash %s" % output_hdfs_path)

hadoopy.writetb(output_hdfs_path, read_rdd(tfidf))
import hadoopy

tb_path = "hdfs://localhost:9000/user/user/edge_list.tb"
N = 64375

if hadoopy.exists(tb_path):
    hadoopy.rmr("-skipTrash %s" % tb_path)


def read_edge_wiki(file_object):
    while True:
        line = file_object.readline().split()
        if not line:
            break
        yield (line[0].decode('utf-8'), 1.0 / N), [l.decode('utf-8') for l in line[1:]]
        #yield line[0].decode('utf-8'),line[1].decode('utf-8')


def main():
    with open('edge_list.txt') as f:
        hadoopy.writetb(tb_path, read_edge_wiki(f))

if __name__ == '__main__':
    main()