def distCp(clusters, src, dest):
    """Copy data between clusters with ``hadoop distcp`` and log transfer stats.

    e.g.  hadoop distcp hdfs://nn1:8020/foo/a hdfs://nn2:8020/bar/foo
    Through an HDFS proxy (httpfs):
          hadoop distcp webhdfs://nn1:3888/gutenberg/ webhdfs://nn2:3888/

    Args:
        clusters: a (from_cluster, to_cluster) pair, used only for the
            network-timing log line.
        src: source URI for distcp.
        dest: destination URI for distcp.

    Returns:
        The distcp stdout with trailing whitespace stripped.
    """
    start = time.time()
    cmd = "hadoop distcp %s %s" % (src, dest)
    rcode, stdout, stderr = _checked_hadoop_fs_command(cmd)
    span = time.time() - start
    from_cluster, to_cluster = clusters
    # update network data
    totalsize = get_total_size(src)
    # Parenthesized single-argument print is identical under Python 2
    # and also valid Python 3 (the bare print statement is Py2-only).
    print("(Total size of data transfered: %s)" % totalsize)
    if totalsize > 0 and span > 0:
        logline = "%s:%s:%s:%s" % (from_cluster, to_cluster, totalsize, span)
        network_filewriteDataTime(logline)
    return stdout.rstrip()
def get_queue_info():
    """Return the raw stdout of ``mapred queue -list``."""
    rcode, stdout, stderr = _checked_hadoop_fs_command("mapred queue -list")
    return stdout
def hdfs_putf(local_path, hdfs_path):
    """Copy a local file to HDFS, overwriting any existing target (-f)."""
    command = "hadoop fs -put -f %s %s" % (local_path, hdfs_path)
    rcode, stdout, stderr = _checked_hadoop_fs_command(command)
def hdfs_rmr(path):
    """Recursively delete ``path`` from HDFS (``hadoop fs -rm -r``)."""
    command = "hadoop fs -rm -r %s" % (path)
    rcode, stdout, stderr = _checked_hadoop_fs_command(command)
def hdfs_du(path):
    """Return the aggregate size of ``path`` on HDFS, as a string of bytes.

    Runs ``hadoop fs -du -s``. Returns the first whitespace-separated
    token of the output; implicitly returns None when the command
    produced no output.
    """
    cmd = "hadoop fs -du -s %s" % (path)
    rcode, stdout, stderr = _checked_hadoop_fs_command(cmd)
    if stdout:
        # split() rather than split(' '): tolerates leading whitespace
        # and the column-aligned multi-space output some Hadoop
        # versions emit, where split(' ')[0] could yield '' or be wrong.
        return stdout.split()[0]
def hdfs_mkdirp(path):
    """Create ``path`` on HDFS, making parent directories as needed (-p)."""
    command = "hadoop fs -mkdir -p %s" % (path)
    rcode, stdout, stderr = _checked_hadoop_fs_command(command)