def on_moved(self, event):
    """
    Non-empty dir: the db records of its files and subdirectories are also modified.
    The old file structure no longer exists locally, so it is rediscovered through
    calls to HDFS in order to update all related db records (local + remote path).
    Link: the corresponding remote file is moved and the path stored inside the
    link file is updated.
    """
    print("on_moved")
    try:
        rem_src_path = self.lc.get_remote_file_path(event.src_path)
        tmp = customize_path(self.local.remotePath,
                             remove_prefix(self.local.localPath, event.dest_path))
        rem_dest_path = rm_link_extension(tmp)
        existing_main_obj = MRBoxObject(event.dest_path, self.local.localFileLimit, rem_src_path)
        if rem_src_path is not None:
            if existing_main_obj.is_dir():
                # get all the remote paths of the files in the dir
                remote_src_paths = self.hadoop.find_remote_paths(rem_src_path)
                local_remote_tuples = []
                for rp in remote_src_paths:
                    if rp == rem_src_path:
                        local_remote_tuples.append((rp, event.dest_path, rem_dest_path))
                    else:
                        file_hierarchy = remove_prefix(rem_src_path, rp)
                        new_remote_path = customize_path(rem_dest_path, file_hierarchy)
                        loc_type = self.lc.get_loc_type_by_remote_path(rp)
                        new_local_path = to_link(customize_path(event.dest_path, file_hierarchy), loc_type)
                        local_remote_tuples.append((rp, new_local_path, new_remote_path))
                        # modify the links' content to point to the new remote path
                        existing_obj = MRBoxObject(new_local_path, self.local.localFileLimit, rp)
                        existing_obj.replace_loc_content(new_remote_path)
                self.lc.update_by_remote_path(local_remote_tuples)
            else:
                existing_main_obj.replace_loc_content(rem_dest_path)
                self.lc.update_by_remote_path([(rem_src_path, event.dest_path, rem_dest_path)])
            self.hadoop.mv(rem_src_path, rem_dest_path)
    except FileNotFoundError:
        print("Move already handled!")
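# on_moved() relies on the to_link() / rm_link_extension() helpers, which are not
# shown above. A minimal sketch of what they could look like, assuming links are
# marked by a fixed filename suffix (the LINK_EXT constant and the '.link' value
# are assumptions, not the project's actual convention):
LINK_EXT = '.link'

def to_link(path, loc_type):
    """Append the link suffix only when the local copy is kept as a link."""
    return path + LINK_EXT if loc_type == 'link' else path

def rm_link_extension(path):
    """Strip the link suffix, if present, to recover the plain path."""
    return path[:-len(LINK_EXT)] if path.endswith(LINK_EXT) else path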
def find_remote_paths(self, starting_path):
    """
    :param starting_path: the remote path of the file / dir that was deleted or moved locally
    :return: list of the remote paths of starting_path itself and of every dir + file
             in the file structure under it
    """
    print("find_remote_paths")
    list_of_paths = [starting_path]
    for sp, subdir, files in self.walk(starting_path):
        for name in subdir:
            list_of_paths.append(customize_path(sp, name))
        for name in files:
            list_of_paths.append(customize_path(sp, name))
    return list_of_paths
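# Illustrative example (hypothetical paths): for a remote dir /mrbox/out containing
# a subdir logs/ with one file part-00000, find_remote_paths('/mrbox/out') returns
# ['/mrbox/out', '/mrbox/out/logs', '/mrbox/out/logs/part-00000'].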
def create_locally_synced_dir(self, cmd, lc, mrbox_dir):
    """
    Creates a dir on hdfs by running the cmd command and creates a copy of it locally
    :param cmd: bash command that creates the hdfs dir
    :param lc: sqlite3 db class instance
    :param mrbox_dir: MRBox file object with the info regarding the dir that will be
                      created locally + on HDFS
    :return:
    """
    # create on hdfs --> tracked file: put on db --> create locally
    print("create locally synced dir")
    subprocess.run(cmd, shell=True, check=True)
    hdfs_chk = hdfs_file_checksum(self.hadoopPath, mrbox_dir.remotePath, mrbox_dir.remoteType)
    lc.insert_tuple_hdfs(mrbox_dir.localPath, mrbox_dir.remotePath, hdfs_chk, mrbox_dir.localType)
    # creates an empty directory for the hdfs outputs locally, triggers on_created()
    os.mkdir(mrbox_dir.localPath)
    print("folder created")
    for rp in self.ls(mrbox_dir.remotePath):
        hdfs_chk = hdfs_file_checksum(self.hadoopPath, rp, 'file')
        file_size = hdfs_file_size(self.hadoopPath, rp)
        f = remove_prefix(mrbox_dir.remotePath, rp)
        lp = customize_path(mrbox_dir.localPath, f)
        mrbox_file = MRBoxObject(lp, mrbox_dir.localFileLimit, rp, file_size, 'file')
        # todo: insert in batch
        lc.insert_tuple_hdfs(mrbox_file.localPath, mrbox_file.remotePath, hdfs_chk, mrbox_file.localType)
        mrbox_file.file_info()
        self.get(mrbox_file)
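# A possible shape for the batched insert that the "todo: insert in batch" comment
# refers to: collect one row per listed HDFS file and write them in a single
# transaction. The helper, table and column names below are hypothetical, not the
# project's actual LocalCatalog schema.
import sqlite3

def insert_tuples_hdfs_batch(conn, rows):
    """rows: iterable of (local_path, remote_path, hdfs_chk, local_type) tuples."""
    with conn:  # one transaction instead of one commit per file
        conn.executemany(
            "INSERT INTO files (local_path, remote_path, hdfs_chk, local_type) "
            "VALUES (?, ?, ?, ?)",
            rows)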
def on_created(self, event):
    """
    Creates the dir / file on HDFS and adds a mapping between the local and the hdfs
    path in the local db. If the created file is a .yaml, it issues a MR job.
    """
    print("on_created")
    if self.lc.check_local_path_exists(event.src_path):
        print("file/dir already exists on hdfs - mapped on db")
        remote_file_path = self.lc.get_remote_file_path(event.src_path)
        obj = MRBoxObject(event.src_path, self.local.localFileLimit, remote_file_path)
        # do we want remote file size?
        # obj.file_info()
        # update needed to insert the loc_chk in the existing db record
        # in case of link: loc_chk != hdfs_chk
        loc_chk = crc32c_file_checksum(obj.localPath, obj.localType)
        self.lc.update_tuple_local(obj.localPath, loc_chk)
    else:
        print("file/dir needs to be created on hdfs - not mapped on db")
        filename = remove_prefix(self.local.localPath, event.src_path)
        remote_file_path = customize_path(self.local.remotePath, filename)
        obj = MRBoxObject(event.src_path, self.local.localFileLimit, remote_file_path)
        # obj.file_info()
        loc_chk = crc32c_file_checksum(obj.localPath, obj.localType)
        self.lc.insert_tuple_local(obj.localPath, obj.remotePath, loc_chk, obj.localType)

    if not self.hadoop.exists(remote_file_path) and obj.is_dir():
        print("creating dir on hdfs")
        self.hadoop.mkdir(remote_file_path)
        hdfs_chk = hdfs_file_checksum(self.hadoop.hadoopPath, obj.remotePath, obj.localType)
        self.lc.update_tuple_hdfs(obj.localPath, hdfs_chk)

    if not self.hadoop.exists(remote_file_path) and obj.is_file():
        print("creating file on hdfs")
        self.hadoop.put(obj.localPath, obj.remotePath)
        hdfs_chk = hdfs_file_checksum(self.hadoop.hadoopPath, obj.remotePath, obj.localType)
        self.lc.update_tuple_hdfs(obj.localPath, hdfs_chk)

    # if it is a link, the remote copy already exists
    # compare_local_hdfs_copy(self.lc, event.src_path)

    if obj.is_file() and event.src_path.endswith('.yaml'):
        self.issue_mr_job(obj.localPath)
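# on_created()/on_moved() follow the watchdog event-handler API (event.src_path,
# event.dest_path). A minimal sketch of how such a handler is typically wired up,
# assuming the class above extends watchdog's FileSystemEventHandler (the function
# and variable names below are illustrative, not taken from the project):
import time
from watchdog.observers import Observer

def watch(handler, local_folder):
    observer = Observer()
    observer.schedule(handler, local_folder, recursive=True)
    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()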
def hdfs_file_size(hadoop_path, hdfs_filepath):
    # todo: how to handle dirs
    """
    Returns the size of a hadoop file in bytes
    :param hadoop_path: where hadoop is installed locally
    :param hdfs_filepath: the path of the file on hdfs
    :return: the file size in bytes as an int
    """
    cmd_hdfs_file_size = customize_path(hadoop_path, 'bin/hdfs') + " dfs -ls " + hdfs_filepath
    res = subprocess.run(cmd_hdfs_file_size, shell=True, check=True, capture_output=True, text=True)
    res = res.stdout
    # the fifth column of "hdfs dfs -ls <file>" is the size in bytes
    file_size = res.split()[4]
    print("HDFS file size in bytes: " + file_size)
    return int(file_size)
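# Regarding the "todo: how to handle dirs": one option (a sketch, not the project's
# implementation) is to fall back to "hdfs dfs -du -s", whose first column is the
# total size in bytes of everything under the given path:
def hdfs_dir_size(hadoop_path, hdfs_dirpath):
    cmd = customize_path(hadoop_path, 'bin/hdfs') + " dfs -du -s " + hdfs_dirpath
    res = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
    return int(res.stdout.split()[0])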
def issue_mr_job(self, filepath):
    """
    Called when a .yaml file is created. Reads the paths of the mapper, reducer,
    input dir and output dir, checks that they exist locally, resolves the HDFS
    input path and issues the MR job.
    :param filepath: the path of the created yaml file; all specified paths are local
    :return:
    """
    print("issue_mr_job")
    with open(filepath, 'r') as f:
        data = yaml.load(f, Loader=yaml.FullLoader)
    mapper_path = data.get('mapper')
    reducer_path = data.get('reducer')
    input_path = customize_path(self.local.localPath, data.get('input'))
    print("input_path: " + input_path)
    output_path = data.get('output')

    # check that the files exist locally
    for f in [mapper_path, reducer_path, input_path]:
        if not os.path.exists(f):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), f)

    # to issue the MR job, the input must be on hdfs --> need its remote path
    # (input_path is already the full local path, so it is looked up directly)
    hdfs_input_path = self.lc.get_remote_file_path(input_path)
    print("hdfs_input_path: " + hdfs_input_path)

    # generate local + remote output paths
    local_output_path = customize_path(self.local.localPath, output_path)
    hdfs_output_path = customize_path(self.local.remotePath, output_path)

    # issue MR job
    cmd_mr = customize_path(self.hadoop.hadoopPath, 'bin/hadoop') + " jar " \
        + customize_path(self.hadoop.hadoopPath, 'share/hadoop/tools/lib/hadoop-streaming-3.2.0.jar') \
        + " -files " + mapper_path + "," + reducer_path \
        + " -mapper 'mapper.py' -reducer 'reducer.py'" \
        + " -input " + hdfs_input_path + " -output " + hdfs_output_path
    try:
        output_dir = MRBoxObject(local_output_path, self.local.localFileLimit,
                                 hdfs_output_path, remote_file_type='dir')
        self.hadoop.create_locally_synced_dir(cmd_mr, self.lc, output_dir)
    except subprocess.CalledProcessError as e:
        print("Map-Reduce job failed!")
        print(e.output)
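# Example of a job spec that issue_mr_job() would parse. All values are illustrative;
# 'mapper'/'reducer' are local paths, while 'input' and 'output' are resolved
# relative to the local and remote mrbox folders:
example_job_yaml = """
mapper: /home/user/jobs/mapper.py
reducer: /home/user/jobs/reducer.py
input: books
output: wordcount_out
"""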
def hdfs_file_checksum(hadoop_path, hdfs_filepath, ftype):
    """
    Computes the checksum of a file on hdfs
    :param hadoop_path: where hadoop is installed locally
    :param hdfs_filepath: the path of the file on hdfs
    :param ftype: the type of the local copy of the file ('dir', 'file', 'link')
    :return: the COMPOSITE-CRC32C checksum string, or None for dirs
    """
    if ftype == 'dir':
        return None
    cmd_hdfs_chk = customize_path(hadoop_path, 'bin/hdfs') + \
        " dfs -Ddfs.checksum.combine.mode=COMPOSITE_CRC -checksum " + hdfs_filepath
    res = subprocess.run(cmd_hdfs_chk, shell=True, check=True, capture_output=True, text=True)
    res = res.stdout
    # output format: "<path>\tCOMPOSITE-CRC32C\t<checksum>\n"
    prefix = hdfs_filepath + "\t" + "COMPOSITE-CRC32C\t"
    return res[len(prefix):].rstrip("\n")
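# The local counterpart crc32c_file_checksum() used in on_created() is not shown
# above. A minimal sketch, assuming the third-party 'crc32c' package; how the
# project formats the digest so it is comparable to the HDFS COMPOSITE-CRC32C
# string is not shown, so the hex formatting here is illustrative only:
import crc32c

def crc32c_file_checksum_sketch(local_path, ftype):
    if ftype == 'dir':
        return None  # mirror hdfs_file_checksum(), which skips dirs
    checksum = 0
    with open(local_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            checksum = crc32c.crc32c(chunk, checksum)
    return format(checksum, '08x')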
app_name = "mrbox" config_file = app_name + ".conf" config_folder = os.path.dirname(os.path.realpath(__file__)) config_filepath = os.path.join(config_folder, config_file) # check if a configuration file exists in current path if not os.path.exists(config_filepath): print("No .conf file is found in %s." % config_folder) sys.exit(1) # read from mrbox.conf config = configparser.ConfigParser() config.read(config_filepath) # local folder properties local_folder = customize_path(config['User']['localPath'], 'mrbox') local_file_size_limit_MB = config['User']['localFileSizeMB'] remote_folder = customize_path(config['User']['hdfsPath'], 'mrbox') if not os.path.exists(local_folder): os.mkdir(local_folder) local_file_size_limit_bytes = bytes_to_mb(int(local_file_size_limit_MB)) local = MRBoxObject(local_folder, local_file_size_limit_bytes, remote_folder) # connect to hdfs and create hadoop interface, todo: check how to create list of multiple hadoops hdfs_con = HDFileSystem(host=config['User']['hdfsHost'], port=config['User'].getint('hdfsPort')) hadoop_path = config['User']['hadoopPath'] hdfs_con.mkdir(remote_folder) hadoop = HadoopInterface(hdfs_con, hadoop_path)
def tail(self, hdfs_path):
    """Prints the last kilobyte of the hdfs file to stdout."""
    cmd = customize_path(self.hadoopPath, 'bin/hdfs') + " dfs -tail " + hdfs_path
    subprocess.run(cmd, shell=True, check=True)
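# mrview.py below also calls hadoop.head(), which is not shown here. A minimal
# sketch following the same pattern, assuming "hdfs dfs -head" is available
# (Hadoop 3.x); it prints the first kilobyte of the file:
def head(self, hdfs_path):
    cmd = customize_path(self.hadoopPath, 'bin/hdfs') + " dfs -head " + hdfs_path
    subprocess.run(cmd, shell=True, check=True)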
def main(argv):
    app_name = "mrbox"
    config_file = app_name + ".conf"
    config_folder = os.path.dirname(os.path.realpath(__file__))
    config_filepath = os.path.join(config_folder, config_file)

    # check if a configuration file exists in the current path
    if not os.path.exists(config_filepath):
        print("No .conf file is found in %s." % config_folder)
        sys.exit(1)

    # parse arguments
    if len(argv) == 0 or len(argv) > 2:
        print("Wrong number of operands.\nTry 'mrview.py help' for more information.")
        sys.exit(1)
    elif argv[0] == 'help':
        print("mrview.py cmd file_path")
        print("Supported link commands: ", *SUPPORTED_LINK_CMDS, sep=',')
        sys.exit(1)
    elif len(argv) != 2:
        # a single non-'help' operand is also an error: cmd needs a file path
        print("Wrong number of operands.\nTry 'mrview.py help' for more information.")
        sys.exit(1)
    cmd = argv[0]
    file_path = argv[1]

    # read from mrbox.conf
    config = configparser.ConfigParser()
    config.read(config_filepath)
    local_folder = customize_path(config['User']['localPath'], 'mrbox')
    local_path = customize_path(local_folder, file_path)
    if not os.path.exists(local_path):
        print("File does not exist in ", local_folder)
        sys.exit(1)

    # connect to hdfs and create the hadoop interface
    hdfs_con = HDFileSystem(host=config['User']['hdfsHost'], port=config['User'].getint('hdfsPort'))
    hadoop_path = config['User']['hadoopPath']
    hadoop = HadoopInterface(hdfs_con, hadoop_path)

    # create sqlite db instance
    full_db_path = os.path.join(config['User']['localPath'], config['User']['dbFile'])
    lc = LocalCatalog(full_db_path)

    # query the db to get the type and, for links, the remote path
    hdfs_path = lc.get_remote_file_path(local_path)
    loc_type = lc.get_loc_type_by_remote_path(hdfs_path)

    # if link, only the supported cmds can be executed on the HDFS copy
    # if dir or file, UNIX cmds are executed locally
    if loc_type == 'link':
        if cmd not in SUPPORTED_LINK_CMDS:
            print(cmd, " not supported for links.\nTry 'mrview.py help' for more information.")
            sys.exit(1)
        elif cmd == 'head':
            hadoop.head(hdfs_path)
        elif cmd == 'tail':
            hadoop.tail(hdfs_path)
    else:
        os.system(cmd + ' ' + local_path)
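# Illustrative invocations (paths are examples only; link-typed entries are resolved
# against their HDFS copy, regular files/dirs run the UNIX command locally):
#
#   python mrview.py help
#   python mrview.py head wordcount_out/part-00000   # link -> hdfs dfs -head
#   python mrview.py cat notes.txt                   # regular file -> runs locally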