def good(self):
    base_path = make_random_str()
    for path in base_path, base_path + UNI_CHR:
        hdfs.dump("foo\n", path)
        self.assertTrue(hdfs.path.exists(path))
        hdfs.rmr(path)
        self.assertFalse(hdfs.path.exists(path))
def dump(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        hdfs.dump(self.data, test_path)
        with hdfs.open(test_path) as fi:
            rdata = fi.read()
        fi.fs.close()
        self.assertEqual(rdata, self.data)
def __setup_remote_paths(self):
    """
    Actually create the working directory and copy the module into it.

    Note: the script has to be readable by Hadoop; though this may not
    generally be a problem on HDFS, where the Hadoop user is usually the
    superuser, things may be different if our working directory is on a
    shared POSIX filesystem.  Therefore, we make the directory and the
    script accessible by all.
    """
    self.logger.debug("remote_wd: %s", self.remote_wd)
    self.logger.debug("remote_exe: %s", self.remote_exe)
    self.logger.debug("remotes: %s", self.files_to_upload)
    if self.args.module:
        self.logger.debug(
            'Generated pipes_code:\n\n %s', self._generate_pipes_code()
        )
    if not self.args.pretend:
        hdfs.mkdir(self.remote_wd)
        hdfs.chmod(self.remote_wd, "a+rx")
        self.logger.debug("created and chmod-ed: %s", self.remote_wd)
        pipes_code = self._generate_pipes_code()
        hdfs.dump(pipes_code, self.remote_exe)
        self.logger.debug("dumped pipes_code to: %s", self.remote_exe)
        hdfs.chmod(self.remote_exe, "a+rx")
        self.__warn_user_if_wd_maybe_unreadable(self.remote_wd)
        for (l, h, _) in self.files_to_upload:
            self.logger.debug("uploading: %s to %s", l, h)
            hdfs.cp(l, h)
    self.logger.debug(
        "Created%sremote paths:" %
        (' [simulation] ' if self.args.pretend else ' ')
    )
def dump(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        hdfs.dump(self.data, test_path, mode="wb")
        with hdfs.open(test_path) as fi:
            rdata = fi.read()
        fi.fs.close()
        self.assertEqual(rdata, self.data)
def write_output(k, args, logger):
    logger.debug("kinship matrix: shape=%r" % (k.shape, ))
    logger.info("serializing output")
    s = KinshipBuilder.serialize(k)
    logger.debug("serialized matrix: %d bytes" % len(s))
    logger.info("writing output to %r" % (args.output, ))
    hdfs.dump(s, args.output, user=args.hdfs_user)
def _write(self, data):
    "Internal Write API"
    schema = self.schema
    wmaid = self.wmaid(data)
    year, month, _ = today()
    hdir = '%s/%s/%s' % (self.hdir, year, month)
    if not hdfs.path.isdir(hdir):
        hdfs.mkdir(hdir)
    fname = file_name(hdir, wmaid, self.compress)

    # create Avro writer and binary encoder
    writer = avro.io.DatumWriter(schema)
    bytes_writer = io.BytesIO()
    if self.compress:
        # use gzip'ed writer with BytesIO file object
        gzip_writer = gzip.GzipFile(fileobj=bytes_writer, mode='wb')
        encoder = avro.io.BinaryEncoder(gzip_writer)
    else:
        # plain binary encoder
        encoder = avro.io.BinaryEncoder(bytes_writer)

    # write records from given data stream to binary writer
    writer.write(data, encoder)

    # close gzip stream if necessary
    if self.compress:
        gzip_writer.flush()
        gzip_writer.close()

    # store raw data to hadoop via HDFS
    hdfs.dump(bytes_writer.getvalue(), fname)

    # close bytes stream
    bytes_writer.close()
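A record produced by the _write method above can presumably be read back by reversing the same steps. The following is a minimal sketch, not taken from the original project, assuming compress=True, the same Avro schema object, and a hypothetical fname:

# Hypothetical read-back sketch for a record written by _write above
# (assumes compress=True and the same Avro schema object).
import io
import gzip
import avro.io
import pydoop.hdfs as hdfs

def read_record(schema, fname):
    raw = hdfs.load(fname, mode="rb")        # raw bytes from HDFS
    payload = gzip.decompress(raw)           # undo the gzip layer
    decoder = avro.io.BinaryDecoder(io.BytesIO(payload))
    reader = avro.io.DatumReader(schema)     # same schema used for writing
    return reader.read(decoder)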
def write(writeFlag):
    if writeFlag:
        # instantiate hadoop
        hdfs.hdfs()
        targetPath = config.targetPath
        targetDirectory = config.targetDirectory
        sourceFile = config.sourceFile
        print("Target Path: " + targetPath)
        print("Target Directory: " + targetDirectory)
        print("Source Path: " + sourceFile)
        dumpFile = open(sourceFile, "r")
        fullText = dumpFile.read()
        dumpFile.close()

        # write to hadoop
        #hdfs.mkdir(targetDirectory)
        hdfs.dump(fullText, targetPath)
        #hdfs.cp(sourceFile, targetPath)
        #print(hdfs.ls("test4"))
        #files = hdfs.ls("test4")

        # read from hadoop
        #hdfs.get("test4/hello.txt", "/tmp/hello.txt")
        #with open("/tmp/hello.txt") as f:
        #    print(f.read())
        #print(hdfs.ls("test", "hduser1"))
        #text = hdfs.load("test/hello.txt")
        #print(text)
def get(self):
    src = self.hdfs_paths[0]
    dest = hdfs.path.split(self.local_paths[0])[-1]
    hdfs.dump(self.data, src)
    hdfs.get(src, dest)
    with open(dest) as fi:
        rdata = fi.read()
    self.assertEqual(rdata, self.data)
def renames(self):
    test_path = self.hdfs_paths[0]
    hdfs.dump(self.data, test_path)
    new_d = hdfs.path.join(self.hdfs_wd, "new_dir")
    new_path = hdfs.path.join(new_d, "new_p")
    hdfs.renames(test_path, new_path)
    self.assertFalse(hdfs.path.exists(test_path))
    self.assertTrue(hdfs.path.exists(new_path))
def renames(self):
    test_path = self.hdfs_paths[0]
    hdfs.dump(self.data, test_path, mode="wb")
    new_d = hdfs.path.join(self.hdfs_wd, "new_dir")
    new_path = hdfs.path.join(new_d, "new_p")
    hdfs.renames(test_path, new_path)
    self.assertFalse(hdfs.path.exists(test_path))
    self.assertTrue(hdfs.path.exists(new_path))
def get(self):
    src = self.hdfs_paths[0]
    dest = hdfs.path.split(self.local_paths[0])[-1]
    hdfs.dump(self.data, src, mode="wb")
    hdfs.get(src, dest, mode="wb")
    with open(dest, 'rb') as fi:
        rdata = fi.read()
    self.assertEqual(rdata, self.data)
def rename(self):
    test_path = self.hdfs_paths[0]
    new_path = "%s.new" % test_path
    hdfs.dump(self.data, test_path, mode="wb")
    hdfs.rename(test_path, new_path)
    self.assertFalse(hdfs.path.exists(test_path))
    self.assertTrue(hdfs.path.exists(new_path))
    self.assertRaises(RuntimeError, hdfs.rename, test_path, self.local_paths[0])
def set_exe(self, pipes_code):
    """
    Dump launcher code to the distributed file system.
    """
    if not self.output:
        raise RuntimeError("no output directory, can't create launcher")
    parent = hdfs.path.dirname(hdfs.path.abspath(self.output.rstrip("/")))
    self.exe = hdfs.path.join(parent, utils.make_random_str())
    hdfs.dump(pipes_code, self.exe)
def __ls(self, ls_func, path_transform):
    for wd, paths in izip(
        (self.local_wd, self.hdfs_wd), (self.local_paths, self.hdfs_paths)
    ):
        for p in paths:
            hdfs.dump(self.data, p)
            self.assertEqual(path_transform(ls_func(p)[0]), p)
        dir_list = [path_transform(p) for p in ls_func(wd)]
        self.assertEqual(set(dir_list), set(paths))
def rename(self):
    test_path = self.hdfs_paths[0]
    new_path = "%s.new" % test_path
    hdfs.dump(self.data, test_path)
    hdfs.rename(test_path, new_path)
    self.assertFalse(hdfs.path.exists(test_path))
    self.assertTrue(hdfs.path.exists(new_path))
    self.assertRaises(
        RuntimeError, hdfs.rename, test_path, self.local_paths[0]
    )
def get_hosts(self):
    if hdfs.default_is_local():
        # only run on HDFS
        return
    hdfs.dump(self.data, self.hdfs_paths[0], mode="wb")
    fs = hdfs.hdfs("default", 0)
    hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
    self.assertTrue(len(hs) > 0)
    self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], -10, 10)
    self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)
def runTest(self):
    path = make_random_str() + UNI_CHR
    hdfs.dump("foo\n", path)
    st = hdfs.path.stat(path)
    atime, mtime = [getattr(st, 'st_%stime' % _) for _ in 'am']
    new_atime, new_mtime = atime + 100, mtime + 200
    hdfs.path.utime(path, (new_atime, new_mtime))
    st = hdfs.path.stat(path)
    self.assertEqual(st.st_atime, new_atime)
    self.assertEqual(st.st_mtime, new_mtime)
    hdfs.rmr(path)
def __make_tree(self, wd):
    d1 = "%s/d1" % wd
    t1 = FSTree(d1)
    d2 = "%s/d2" % d1
    t2 = t1.add(d2)
    hdfs.mkdir(d2)
    for t, d, bn in ((t1, d1, "f1"), (t2, d2, "f2")):
        f = "%s/%s" % (d, bn)
        hdfs.dump(self.data, f)
        t.add(f, 0)
    return t1
def run_phase_two(args, mappers, input_, logger):
    launcher_text = generate_launcher("bl.core.gt.mr.kinship.phase_two")
    launcher_name, mr_out_dir = random_str(args), random_str(args)
    logger.debug("launcher_name: %r" % (launcher_name, ))
    logger.debug("mr_out_dir: %r" % (mr_out_dir, ))
    hdfs.dump(launcher_text, launcher_name)
    mr_conf = MR_CONF.copy()
    mr_conf["mapred.map.tasks"] = mappers
    mr_conf["mapred.job.name"] = "kinship_phase_two"
    hadut.run_pipes(launcher_name, input_, mr_out_dir, properties=mr_conf)
    return mr_out_dir
def get_hosts(self):
    if hdfs.default_is_local():
        # only run on HDFS
        return
    hdfs.dump(self.data, self.hdfs_paths[0])
    fs = hdfs.hdfs("default", 0)
    hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
    self.assertTrue(len(hs) > 0)
    self.assertRaises(
        ValueError, fs.get_hosts, self.hdfs_paths[0], -10, 10
    )
    self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)
def create(self, path, mode, fi=None):
    '''Open a nonexistent file. This will just create a new file and
    generate a new filehandle for you.'''
    mode = mode & 1
    maxfh = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
    if len(self.filehandles) == maxfh:
        raise FuseOSError(errno.EMFILE)
    while self.fhmax in self.filehandles.keys():
        self.fhmax = (self.fhmax + 1) % maxfh
    hdfs.dump('', path)
    self.filehandles[self.fhmax] = self.hdfs.open_file(path, mode)
    return self.fhmax
def __cp_file(self, wd):
    fn = "%s/fn" % wd
    hdfs.dump(self.data, fn, mode="wb")
    dest_dir = "%s/dest_dir" % wd
    hdfs.mkdir(dest_dir)
    fn_copy_on_wd = "%s/fn_copy" % wd
    hdfs.cp(fn, fn_copy_on_wd, mode="wb")
    self.assertEqual(hdfs.load(fn_copy_on_wd), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd)
    fn_copy_on_dest_dir = "%s/fn" % dest_dir
    hdfs.cp(fn, dest_dir, mode="wb")
    self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, dest_dir)
def __cp_file(self, wd):
    fn = "%s/fn" % wd
    hdfs.dump(self.data, fn)
    dest_dir = "%s/dest_dir" % wd
    hdfs.mkdir(dest_dir)
    fn_copy_on_wd = "%s/fn_copy" % wd
    hdfs.cp(fn, fn_copy_on_wd)
    self.assertEqual(hdfs.load(fn_copy_on_wd), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, fn_copy_on_wd)
    fn_copy_on_dest_dir = "%s/fn" % dest_dir
    hdfs.cp(fn, dest_dir)
    self.assertEqual(hdfs.load(fn_copy_on_dest_dir), self.data)
    self.assertRaises(IOError, hdfs.cp, fn, dest_dir)
def test_kind(self):
    for path in self.path, self.u_path:
        self.assertTrue(hdfs.path.kind(path) is None)
        try:
            hdfs.dump("foo\n", path)
            self.assertEqual('file', hdfs.path.kind(path))
            hdfs.rmr(path)
            hdfs.mkdir(path)
            self.assertEqual('directory', hdfs.path.kind(path))
        finally:
            try:
                hdfs.rmr(path)
            except IOError:
                pass
def test_kind(self):
    path = utils.make_random_str()
    self.assertTrue(hdfs.path.kind(path) is None)
    try:
        hdfs.dump("foo\n", path)
        self.assertEqual('file', hdfs.path.kind(path))
        hdfs.rmr(path)
        hdfs.mkdir(path)
        self.assertEqual('directory', hdfs.path.kind(path))
    finally:
        try:
            hdfs.rmr(path)
        except IOError:
            pass
def test_isdir(self):
    path = utils.make_random_str()
    self.assertFalse(hdfs.path.isdir(path))
    try:
        hdfs.dump("foo\n", path)
        self.assertFalse(hdfs.path.isdir(path))
        hdfs.rmr(path)
        hdfs.mkdir(path)
        self.assertTrue(hdfs.path.isdir(path))
    finally:
        try:
            hdfs.rmr(path)
        except IOError:
            pass
def test_isdir(self):
    for path in self.path, self.u_path:
        self.assertFalse(hdfs.path.isdir(path))
        try:
            hdfs.dump("foo\n", path)
            self.assertFalse(hdfs.path.isdir(path))
            hdfs.rmr(path)
            hdfs.mkdir(path)
            self.assertTrue(hdfs.path.isdir(path))
        finally:
            try:
                hdfs.rmr(path)
            except IOError:
                pass
def lowercase():
    f = request.files['file']
    f.save(os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(f.filename)))
    with open(f.filename, 'r') as fopen:
        hdfs.dump(fopen.read(), '/user/input_lowercase/text')
    os.system(
        "pydoop script --num-reducers 0 -t '' lowercase.py /user/input_lowercase /user/output_lowercase"
    )
    list_files = hdfs.hdfs().list_directory('/user/output_lowercase')
    return json.dumps([
        hdfs.load(file['name'], mode='rt')
        for file in list_files
        if 'SUCCESS' not in file['name']
    ])
def chown(self):
    new_user = '******'
    test_path = self.hdfs_paths[0]
    hdfs.dump(self.data, test_path)
    hdfs.chown(test_path, user=new_user)
    path_info = hdfs.lsl(test_path)[0]
    self.assertEqual(path_info['owner'], new_user)
    prev_owner = path_info['owner']
    prev_grp = path_info['group']
    # owner and group should remain unchanged
    hdfs.chown(test_path, user='', group='')
    path_info = hdfs.lsl(test_path)[0]
    self.assertEqual(path_info['owner'], prev_owner)
    self.assertEqual(path_info['group'], prev_grp)
def wordcount():
    f = request.files['file']
    f.save(os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(f.filename)))
    with open(f.filename, 'r') as fopen:
        hdfs.dump(fopen.read(), '/user/input_wordcount/text')
    os.system(
        'pydoop script -c combiner wordcount.py /user/input_wordcount /user/output_wordcount'
    )
    list_files = hdfs.hdfs().list_directory('/user/output_wordcount')
    return json.dumps([
        hdfs.load(file['name'], mode='rt')
        for file in list_files
        if 'SUCCESS' not in file['name']
    ])
def chown(self):
    new_user = '******'
    test_path = self.hdfs_paths[0]
    hdfs.dump(self.data, test_path, mode="wb")
    hdfs.chown(test_path, user=new_user)
    path_info = hdfs.lsl(test_path)[0]
    self.assertEqual(path_info['owner'], new_user)
    prev_owner = path_info['owner']
    prev_grp = path_info['group']
    # owner and group should remain unchanged
    hdfs.chown(test_path, user='', group='')
    path_info = hdfs.lsl(test_path)[0]
    self.assertEqual(path_info['owner'], prev_owner)
    self.assertEqual(path_info['group'], prev_grp)
def __ls(self, ls_func, path_transform):
    for wd, paths in izip(
        (self.local_wd, self.hdfs_wd), (self.local_paths, self.hdfs_paths)
    ):
        for p in paths:
            hdfs.dump(self.data, p)
        test_dir = "%s/%s" % (wd, "test_dir")
        test_path = "%s/%s" % (test_dir, "test_path")
        hdfs.dump(self.data, test_path)
        paths.append(test_dir)
        for recursive in False, True:
            if recursive:
                paths.append(test_path)
            dir_list = [
                path_transform(p) for p in ls_func(wd, recursive=recursive)
            ]
            self.assertEqual(sorted(dir_list), sorted(paths))
def __ls(self, ls_func, path_transform):
    for wd, paths in czip(
        (self.local_wd, self.hdfs_wd), (self.local_paths, self.hdfs_paths)
    ):
        for p in paths:
            hdfs.dump(self.data, p, mode="wb")
        test_dir = "%s/%s" % (wd, "test_dir")
        test_path = "%s/%s" % (test_dir, "test_path")
        hdfs.dump(self.data, test_path, mode="wb")
        paths.append(test_dir)
        for recursive in False, True:
            if recursive:
                paths.append(test_path)
            dir_list = [
                path_transform(p) for p in ls_func(wd, recursive=recursive)
            ]
            self.assertEqual(sorted(dir_list), sorted(paths))
def __make_tree(self, wd, root="d1", create=True):
    """
    d1
    |-- d2
    |   `-- f2
    `-- f1
    """
    d1 = "%s/%s" % (wd, root)
    t1 = FSTree(d1)
    d2 = "%s/d2" % d1
    t2 = t1.add(d2)
    if create:
        hdfs.mkdir(d2)
    for t, d, bn in ((t1, d1, "f1"), (t2, d2, "f2")):
        f = "%s/%s" % (d, bn)
        if create:
            hdfs.dump(self.data, f, mode="wb")
        t.add(f, 0)
    return t1
def dump(data, hdfs_path):
    """
    Dumps data to a file

    Args:
        :data: data to write to hdfs_path
        :hdfs_path: You can specify either a full hdfs pathname or a
            relative one (relative to your Project's path in HDFS).
    """
    hdfs_path = _expand_path(hdfs_path, exists=False)
    return hdfs.dump(data, hdfs_path)
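A minimal hedged usage sketch for the wrapper above; the relative path and payload are made up, and _expand_path is assumed to resolve the path under the project's HDFS directory as the docstring describes:

# Hypothetical usage of the dump() wrapper above; "Logs/README_example.txt"
# is an assumed relative path that _expand_path resolves for us.
dump("hello from the project\n", "Logs/README_example.txt")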
def __setup_remote_paths(self):
    """
    Actually create the working directory and copy the module into it.

    Note: the script has to be readable by Hadoop; though this may not
    generally be a problem on HDFS, where the Hadoop user is usually the
    superuser, things may be different if our working directory is on a
    shared POSIX filesystem.  Therefore, we make the directory and the
    script accessible by all.
    """
    pipes_code = self.__generate_pipes_code()
    hdfs.mkdir(self.remote_wd)
    hdfs.chmod(self.remote_wd, "a+rx")
    hdfs.dump(pipes_code, self.remote_exe)
    hdfs.chmod(self.remote_exe, "a+rx")
    hdfs.put(self.args.module, self.remote_module)
    hdfs.chmod(self.remote_module, "a+r")
    self.__warn_user_if_wd_maybe_unreadable(self.remote_wd)
    self.logger.debug("Created remote paths:")
    self.logger.debug(self.remote_wd)
    self.logger.debug(self.remote_exe)
    self.logger.debug(self.remote_module)
def dump(data, hdfs_path):
    """
    Dumps data to a file

    Args:
        :data: data to write to hdfs_path
        :hdfs_path: You can specify either a full hdfs pathname or a
            relative one (relative to your Project's path in HDFS).
    """
    #split = hdfs_path.split('/')
    #filename = split[len(split) - 1]
    #directory = "/".join(split[0:len(split)-1])
    hdfs_path = _expand_path(hdfs_path, exists=False)
    return hdfs.dump(data, hdfs_path)
def samefile_rel(self):
    p = make_random_str() + UNI_CHR
    hdfs.dump("foo\n", p)
    self.assertTrue(hdfs.path.samefile(p, hdfs.path.abspath(p)))
    hdfs.rmr(p)
def setUp(self):
    self.path = make_random_str() + UNI_CHR
    hdfs.dump("foo\n", self.path)
def good(self):
    path = utils.make_random_str()
    hdfs.dump("foo\n", path)
    self.assertTrue(hdfs.path.exists(path))
    hdfs.rmr(path)
    self.assertFalse(hdfs.path.exists(path))
def load(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        hdfs.dump(self.data, test_path)
        rdata = hdfs.load(test_path)
        self.assertEqual(rdata, self.data)
def dump(self, data, fname):
    "Dump given data directly to HDFS"
    hdfs.dump(data, fname)
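The one-line method above simply delegates to pydoop. A stand-alone sketch of the same write, with a read-back check via hdfs.load and a hypothetical path, might look like this:

# Stand-alone sketch of the dump/load round trip (the path is hypothetical).
import pydoop.hdfs as hdfs

path = "tmp/dump_example.txt"
hdfs.dump("payload\n", path)                      # write the string to HDFS
assert hdfs.load(path, mode="rt") == "payload\n"  # read it back as text
hdfs.rmr(path)                                    # clean up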
import os
import itertools
import swiftclient
import pydoop.hdfs as hdfs

container = 'w251-enron'
prefix = 'clean_v2'
hdfs_prefix = '/enron'

authurl = os.environ['SWIFT_AUTH_URL']
user = os.environ['SWIFT_USER']
key = os.environ['SWIFT_KEY']

conn = swiftclient.client.Connection(authurl=authurl, user=user, key=key)
header, objects = conn.get_container(container, prefix=prefix, full_listing=True)

hdfs.mkdir(hdfs_prefix)

total = len(objects)
count = 1
for obj in objects:
    name = obj['name']
    print('Downloading %s (%d of %d)' % (name, count, total))
    header, contents = conn.get_object(container, name)
    filename = name.replace('/', '_')
    hdfs.dump(contents, '%s/%s' % (hdfs_prefix, filename))
    count += 1
def load(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        hdfs.dump(self.data, test_path, mode="wb")
        rdata = hdfs.load(test_path)
        self.assertEqual(rdata, self.data)
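Several of the tests above differ only in passing mode="wb". A minimal sketch of that binary round trip, with a made-up path, looks like this:

# Minimal sketch of the binary (mode="wb") dump/load round trip exercised
# by the tests above; the path is hypothetical.
import pydoop.hdfs as hdfs

path = "tmp/binary_dump_example"
hdfs.dump(b"\x00\x01binary payload", path, mode="wb")   # write raw bytes
assert hdfs.load(path, mode="rb") == b"\x00\x01binary payload"
hdfs.rmr(path)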