import os
import shutil
from os import listdir
from os.path import isfile, join

from fsplit.filesplit import FileSplit
from hdfs import InsecureClient

import settings


def upload_to_hdfs(input_dir, output_dir, chunk_size):
    # Locate the files in the input directory.
    files = [
        os.path.abspath("{}/{}".format(input_dir, f))
        for f in listdir(input_dir) if isfile(join(input_dir, f))
    ]

    # Set up a clean local temp dir for the file chunks.
    tmp_dir = "{}/tmp".format(input_dir)
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    # Split the files into `chunk_size` MB chunks, collecting the chunk paths
    # via the split callback (FileSplit invokes it with each chunk's path and size).
    split_files = []

    def split_callback(path, size):
        split_files.append(path)

    for f in files:
        fs = FileSplit(file=f, splitsize=chunk_size * 1e6, output_dir=tmp_dir)
        fs.split(callback=split_callback)

    # Connect to HDFS over WebHDFS.
    hdfs_client = InsecureClient(
        "http://{}:9870".format(settings.HDFS_HOST_VALUE),
        user=settings.HDFS_USER_VALUE)

    # Delete the existing output dir, if any.
    if hdfs_client.content(output_dir, strict=False) is not None:
        hdfs_client.delete(output_dir, recursive=True)

    # Upload the chunks to a temporary HDFS dir, then rename it to output_dir
    # so the finished upload appears in a single step.
    hdfs_client.upload(hdfs_path="/tmp",
                       local_path=tmp_dir,
                       n_threads=-1,
                       overwrite=True)
    hdfs_client.rename("/tmp", output_dir)

    print("{} files uploaded to hdfs host '{}{}' ({} file chunks total)".format(
        len(files), settings.HDFS_HOST_VALUE, output_dir, len(split_files)))

    # Delete the local temp files.
    shutil.rmtree(tmp_dir)

    # Remote chunk paths after the rename (assumes the chunks landed directly
    # under output_dir).
    hdfs_file_paths = [
        "{}/{}".format(output_dir, os.path.basename(p)) for p in split_files
    ]
    return hdfs_file_paths
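
# Hypothetical usage sketch (not part of the original function): assumes the
# project's `settings` module points at a live cluster, a local directory of
# input files, and 128 MB chunks.
if __name__ == "__main__":
    uploaded = upload_to_hdfs(input_dir="/data/incoming",
                              output_dir="/data/warehouse/incoming",
                              chunk_size=128)
    print("remote chunks:", uploaded)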
def renameFiles(ip='172.20.10.2',
                port='9870',
                username='******',
                MainName='result-part',
                SubName='.json',
                dirPath='/tmp/Cathay/'):
    client = InsecureClient("http://" + ip + ":" + port, user=username)
    if dirPath[-1] != '/':
        dirPath += '/'
    fns = client.list(dirPath)
    # Rename "part-NNNNN" output files to "result-partN.json", numbering from 1.
    for fn in fns:
        if 'part-' in fn:
            num = str(int(fn.split('part-')[-1]) + 1)
            client.rename(dirPath + fn, dirPath + MainName + num + SubName)
    return str(fns) + "\n Change to \n" + str(client.list(dirPath))
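
# Usage sketch (hypothetical host): renames part-00000, part-00001, ... under
# /tmp/Cathay/ to result-part1.json, result-part2.json, ...
summary = renameFiles(ip='namenode.example.com', port='9870',
                      username='hdfs', dirPath='/tmp/Cathay/')
print(summary)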
from hdfs import InsecureClient


class HDFSLibrary:
    """ Test library for working with HDFS """
    WEB_HDFS_URL = ""
    client = None

    def __init__(self, namenode="localhost", port="50070"):
        self.WEB_HDFS_URL = 'http://' + namenode + ':' + str(port)
        print(namenode, ">>", port, ">>", self.WEB_HDFS_URL)
        self.client = InsecureClient(self.WEB_HDFS_URL)

    def check_hdfs_file_exists(self, file_path, stop=False):
        if self.client.status(file_path, strict=False) is None:
            if stop:
                # Fail loudly so the calling keyword stops here.
                print("ERROR: File does not exist:", file_path)
                raise AssertionError("File does not exist: " + file_path)
            return False
        return True

    def get_hdfs_file_content(self, file_path):
        self.check_hdfs_file_exists(file_path, stop=True)
        data = ""
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                data += line
        return data

    def search_string_in_hdfs_file(self, file_path, text1,
                                   text2="aqwszx", text3="xzswqa"):
        # text2/text3 default to improbable placeholder strings so unused
        # search terms never match.
        self.check_hdfs_file_exists(file_path, stop=True)
        found = ""
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) == -1 and line.find(text2) == -1 \
                        and line.find(text3) == -1:
                    continue
                found += line
        return found

    def hdfs_file_should_not_contain(self, file_path, text1,
                                     text2="aqwszx", text3="xzswqa"):
        self.check_hdfs_file_exists(file_path, stop=True)
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) != -1 or line.find(text2) != -1 \
                        or line.find(text3) != -1:
                    return False
        return True

    ########################
    #                      #
    #   BASIC FUNCTIONS:   #
    #                      #
    ########################
    def get_hdfs_file_folder_content_summary(self, file_path):
        """ Retrieving a file or folder content summary.
        :return: returns a file or folder content summary.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.content(file_path)

    def get_hdfs_file_folder_status(self, file_path):
        """ Retrieving a file or folder status.
        :return: returns a file or folder status.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.status(file_path)

    def list_hdfs_directory(self, folder_path):
        """ Listing all files inside a directory.
        :return: returns a file list.
        """
        self.check_hdfs_file_exists(folder_path, stop=True)
        return self.client.list(folder_path)

    def move_hdfs_file(self, old_path, new_path):
        """ Renaming ("moving") a file.
        :return: NA
        """
        self.check_hdfs_file_exists(old_path, stop=True)
        self.client.rename(old_path, new_path)

    def delete_hdfs_file(self, file_path):
        """ Deleting a file or folder recursively.
        :return: returns `True` if the deletion was successful, otherwise `False`.
        """
        self.check_hdfs_file_exists(file_path)
        return self.client.delete(file_path, recursive=True)

    def copy_to_local_hdfs_file(self, hdfs_path, local_path):
        """ Copy a file or folder from HDFS to local.
        :return: local_path
        """
        self.check_hdfs_file_exists(hdfs_path)
        return self.client.download(hdfs_path, local_path,
                                    overwrite=True, n_threads=4)

    def copy_from_local_hdfs_file(self, local_path, hdfs_path):
        """ Copy a file or folder from local to HDFS.
        :return: hdfs_path
        """
        return self.client.upload(hdfs_path, local_path,
                                  overwrite=True, n_threads=4)

    def get_hdfs_file_checksum(self, file_path):
        """ Get the checksum value for a file.
        :return: checksum
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.checksum(file_path)

    def create_hdfs_dir(self, dir_path, perm=755):
        """ Create a directory (or recursive directories) on HDFS.
        :return: NA
        """
        self.client.makedirs(dir_path, permission=perm)
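
# Usage sketch (hypothetical namenode): the library can also be driven
# directly from Python, outside a test runner.
lib = HDFSLibrary(namenode="namenode.example.com", port="50070")
if lib.check_hdfs_file_exists("/tmp/report.txt"):
    print(lib.get_hdfs_file_content("/tmp/report.txt"))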
import os
import tempfile

from flask import current_app
from hdfs import InsecureClient

# `DataType` is a project-local enum (Folder/File); its import is omitted here.


class HadoopFileSystem(object):
    def __init__(self, *opts):
        self.client = InsecureClient(current_app.config['WEBHDFS_ADDR'],
                                     user=current_app.config['WEBHDFS_USER'])

    # def make_tree(self, datasourceid, client, path):
    #     tree = dict(name=(os.path.basename(path),
    #                       datasourceid + os.path.sep + path), children=[])
    #     try:
    #         lst = client.list(path, status=True)
    #     except:
    #         pass  # ignore errors
    #     else:
    #         for fsitem in lst:
    #             fn = os.path.join(path, fsitem[0])
    #             if fsitem[1]['type'] == "DIRECTORY":
    #                 tree['children'].append(make_hdfs_tree(datasourceid, client, fn))
    #             else:
    #                 tree['children'].append({'name': (fsitem[0], datasourceid + os.path.sep + fn),
    #                                          'children': []})
    #     return tree

    def make_json(self, datasourceid, base, relative_path):
        path = os.path.join(base, relative_path)
        data_json = {'datasource': datasourceid,
                     'path': relative_path,
                     'name': os.path.basename(relative_path)}
        status = self.client.status(path, False)
        if status is not None:
            if status['type'] == "DIRECTORY":
                data_json['type'] = DataType.Folder
                data_json['children'] = [
                    self.make_json(datasourceid, base,
                                   os.path.join(relative_path, fn))
                    for fn in self.client.list(path)
                ]
            else:
                data_json['type'] = DataType.File
        # print(json.dumps(data_json))
        return data_json

    def makedirs(self, path):
        try:
            self.client.makedirs(path)
        except Exception:
            return None
        return path

    def delete(self, path):
        try:
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e:
            print(e)

    def addfolder(self, path):
        # Find the first unused "New Folder (i)" name. (The original looped
        # while the name did NOT exist, which never terminates on a fresh
        # directory; the condition is inverted here.)
        i = 0
        while self.client.status(os.path.join(path, "New Folder ({0})".format(i)),
                                 False) is not None:
            i += 1
        return self.makedirs(os.path.join(path, "New Folder ({0})".format(i)))

    def rename(self, oldpath, newpath):
        try:
            self.client.rename(oldpath, newpath)
        except Exception as e:
            print(e)

    def saveUpload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(),
                                 os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            self.client.upload(os.path.dirname(fullpath), localpath, True)
        except Exception as e:
            print(e)

    def download(self, fullpath):
        status = self.client.status(fullpath, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(),
                                     os.path.basename(fullpath))
            return self.client.download(fullpath, localpath, True)
        return None
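
# Usage sketch (hypothetical config values): the class reads its connection
# settings from the Flask application config, so it must run inside an
# application context.
from flask import Flask

app = Flask(__name__)
app.config['WEBHDFS_ADDR'] = 'http://namenode.example.com:9870'
app.config['WEBHDFS_USER'] = 'hdfs'

with app.app_context():
    fs = HadoopFileSystem()
    print('created', fs.addfolder('/user/hdfs/projects'))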
import os
import tempfile
from os.path import isfile, join
from urllib.parse import urljoin, urlsplit, urlunparse

from hdfs import InsecureClient


class HadoopFileSystem():
    def __init__(self, url, user):
        u = urlsplit(url)
        if u.scheme != 'http' and u.scheme != 'https':
            raise ValueError("Invalid name node address")
        self.url = urlunparse((u.scheme, u.netloc, '', '', '', ''))
        self.client = InsecureClient(self.url, user=user)
        self.localdir = u.path
        self.prefix = 'HDFS'

    def normalize_path(self, path):
        path = os.path.normpath(path)
        path = self.strip_prefix(path)
        while path and path[0] == os.sep:
            path = path[1:]
        return os.path.join(self.localdir, path)

    def strip_prefix(self, path):
        return path[len(self.prefix):] if path.startswith(self.prefix) else path

    def strip_root(self, path):
        path = self.strip_prefix(path)
        if path.startswith(self.url):
            path = path[len(self.url):]
            if not path.startswith(self.localdir):
                raise ValueError(
                    'Invalid hdfs path. It must start with the root directory')
        return path[len(self.localdir):] if path.startswith(self.localdir) else path

    def create_folder(self, path):
        try:
            path = self.normalize_path(path)
            self.client.makedirs(path)
        except Exception:
            return None
        return path

    def remove(self, path):
        try:
            path = self.normalize_path(path)
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e:
            print(e)

    def rename(self, oldpath, newpath):
        try:
            oldpath = self.normalize_path(oldpath)
            newpath = self.normalize_path(newpath)
            self.client.rename(oldpath, newpath)
        except Exception as e:
            print(e)

    def get_files(self, path):
        path = self.normalize_path(path)
        files = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] != "DIRECTORY":
                files.append(f)
        return files

    def get_folders(self, path):
        path = self.normalize_path(path)
        folders = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] == "DIRECTORY":
                folders.append(f)
        return folders

    def exists(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status is not None

    def isdir(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status['type'] == "DIRECTORY"

    def isfile(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status['type'] == "FILE"

    def read(self, path):
        path = self.normalize_path(path)
        with self.client.read(path) as reader:
            return reader.read().decode('utf-8')

    def write(self, path, content):
        path = self.normalize_path(path)
        self.client.write(path, content)

    def make_json(self, path):
        normalized_path = self.normalize_path(path)
        data_json = {
            'path': urljoin(self.url, normalized_path),
            'text': os.path.basename(path)
        }
        status = self.client.status(normalized_path, False)
        if status is not None:
            data_json['folder'] = status['type'] == "DIRECTORY"
            if status['type'] == "DIRECTORY":
                data_json['nodes'] = [
                    self.make_json(os.path.join(path, fn))
                    for fn in self.client.list(normalized_path)
                ]
        # print(json.dumps(data_json))
        return data_json

    def save_upload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(),
                                 os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            if isfile(fullpath):
                fullpath = os.path.dirname(fullpath)
            self.client.upload(self.normalize_path(fullpath), localpath, True)
        except Exception as e:
            print(e)

    def download(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(),
                                     os.path.basename(path))
            return self.client.download(path, localpath, True)
        else:
            return None
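
# Usage sketch (hypothetical URL): the constructor expects the namenode
# address with the HDFS root directory as the URL's path component.
fs = HadoopFileSystem('http://namenode.example.com:9870/user/hdfs', 'hdfs')
if not fs.exists('reports'):
    fs.create_folder('reports')
fs.write('reports/hello.txt', b'hello from webhdfs')
print(fs.read('reports/hello.txt'))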
from hdfs import InsecureClient

# Assumed connection setup (host/port are placeholders; the original snippet
# relied on a pre-existing `client`).
client = InsecureClient('http://localhost:9870', user='hdfs')

print('Begin')

# 1. Make a directory named: /activity1/
client.makedirs(hdfs_path='/activity1/', permission=None)
client.makedirs(hdfs_path='/activity1/data/', permission=None)

# 2. Put the file RandomText.txt into HDFS as the path: /activity1/data/RandomText.txt
client.upload(hdfs_path='/activity1/data/', local_path='./RandomText.txt')

# 3. List the contents of the directory /activity1/data/
print(client.list('/activity1/data'))

# 4. Move the HDFS file /activity1/data/RandomText.txt to /activity1/data/NotSoRandomText.txt
client.rename('/activity1/data/RandomText.txt',
              '/activity1/data/NotSoRandomText.txt')

# 5. Append the local file RandomText.txt to the end of the HDFS file:
#    /activity1/data/NotSoRandomText.txt
#    (read the whole file, not just its last line as the original loop did)
with open('./RandomText.txt', 'r') as f:
    data = f.read()
client.write(hdfs_path='/activity1/data/NotSoRandomText.txt',
             data=data, append=True)

# 6. List the disk space used by the directory /activity1/data/
diskSpaceUsed = client.content('/activity1/data/', strict=True)
print(diskSpaceUsed['spaceConsumed'])

# 7. Put the local file MoreRandomText.txt into HDFS as the path:
#    /activity1/data/MoreRandomText.txt
client.upload(hdfs_path='/activity1/data/MoreRandomText.txt',
              local_path='./MoreRandomText.txt')
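
# Optional verification (not part of the original activity): confirm the
# append grew the file and that step 7 landed where expected.
status = client.status('/activity1/data/NotSoRandomText.txt')
print('NotSoRandomText.txt length:', status['length'])
print(client.list('/activity1/data'))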
import time

import numpy as np
import pandas as pd
from cassandra.cluster import Cluster
from hdfs import InsecureClient
from hdfs.util import HdfsError

# Module-level service state (assumed defaults; `create_database` is a
# project-local helper whose import is omitted here).
isRunning = False
stopRunning = False
currentFileToImport = ""


def start_service():
    global isRunning
    global stopRunning
    global currentFileToImport
    stopRunning = False
    isRunning = True

    while not stopRunning:
        # Connect to the Cassandra cluster.
        cluster = Cluster(contact_points=['cassandra'], port=9042)
        session = cluster.connect()
        session.default_timeout = 10

        # Create the database if it does not exist.
        replicationFactor = 2
        forceReplace = False
        db_name = "pjm"
        columnFamilyName = "estimated_load_hourly"
        create_database(session, db_name, columnFamilyName,
                        replicationFactor, forceReplace)

        try:
            # Download path of the data files.
            hdfs_file_path = "/user/root/data/pjm"

            # Connect to the HDFS client.
            client = InsecureClient(url='http://namenode:9870', user='******')

            # Create the folder that stores the processed files.
            client.makedirs(hdfs_file_path + '/imported')

            # Fetch the list of files to process.
            files = client.list(hdfs_file_path, status=True)
            # print(files)
            # print(client.parts(hdfs_file_path))

            # Process the files.
            for pjm_file, filestatus in files:
                # Update the file name used for status reporting.
                currentFileToImport = pjm_file
                print(pjm_file)
                if filestatus['type'] == 'FILE':
                    # Read the file into memory.
                    with client.read(hdfs_file_path + "/" + pjm_file,
                                     encoding='utf-8') as reader:
                        df = pd.read_csv(reader, sep=',', header='infer')

                    # ----- Initial transformation of the DataFrame -----
                    # Drop unused columns.
                    df = df.drop(columns=[
                        'datetime_beginning_ept', 'datetime_beginning_utc',
                        'datetime_ending_utc'
                    ])

                    # Drop duplicate observations, keeping the first occurrence.
                    df = df.drop_duplicates(
                        subset=['datetime_ending_ept', 'load_area'],
                        keep='first')

                    # Parse the observation timestamp (vectorized; the
                    # original applied pd.to_datetime row by row).
                    df['datetime_measure'] = pd.to_datetime(
                        df['datetime_ending_ept'],
                        format='%m/%d/%Y %I:%M:%S %p')
                    df['datetime_measure'] = df['datetime_measure'].astype(
                        'datetime64[ns]')
                    df = df.drop(columns=['datetime_ending_ept'])

                    # Raw-data import into Cassandra (table
                    # "estimated_load_hourly"), kept for reference:
                    # for i in df.index:
                    #     request_insert = "INSERT INTO " + columnFamilyName \
                    #         + " (datetime_ending_ept, load_area, estimated_load) " \
                    #         + " VALUES ('" + str(df['datetime_measure'][i]) + "','" \
                    #         + df['load_area'][i] + "', " \
                    #         + str(df['estimated_load_hourly'][i]) + ");"
                    #     print(request_insert)
                    #     session.execute(request_insert)

                    # ----- Data cleaning -----
                    # Goal: keep the total consumption per hour.

                    # Drop the load-area column.
                    df = df.drop(columns=['load_area'])

                    # Compute the total consumption per hour.
                    df = df.groupby(by=['datetime_measure']).sum()

                    # Derive extra calendar columns from the measurement date.
                    df['date'] = df.index.date
                    df['annee'] = df.index.year
                    df['mois'] = df.index.month
                    df['semaine'] = df.index.isocalendar().week
                    df['heure'] = df.index.hour
                    df['jour_annee'] = df.index.dayofyear
                    df['trimestre'] = df.index.quarter
                    df['jour_semaine'] = df.index.dayofweek
                    df['jour_mois'] = df.index.day

                    # Fill in the very rare missing hours so the observations
                    # keep a fixed hourly frequency.
                    checkmonth = pd.DataFrame(
                        df.groupby(['jour_annee', 'annee'])['annee'].count())
                    checkmonth.rename(columns={'annee': 'nb'}, inplace=True)
                    df_missing = checkmonth[checkmonth.nb != 24]
                    df_missing = df_missing.reset_index()
                    Hour0_24 = pd.DataFrame(np.arange(24))
                    df_to_append = pd.DataFrame()
                    for x, y in zip(df_missing['jour_annee'],
                                    df_missing['annee']):
                        print("Day with missing hours:", x, y)
                        df_encours = df[(df.jour_annee == x) & (df.annee == y)]
                        h_missing = Hour0_24[~Hour0_24[0].isin(df_encours.heure)]
                        h = h_missing.iloc[0].name
                        # Reuse the observation of the following hour
                        # (pd.concat replaces DataFrame.append, removed in
                        # pandas 2.0).
                        df_to_append = pd.concat([
                            df_to_append,
                            df[(df.jour_annee == x) & (df.annee == y)
                               & (df.heure == (h + 1))]
                        ])
                    df_to_append.heure = df_to_append.heure - 1
                    df = pd.concat([df, df_to_append])

                    # Insert the cleaned data into Cassandra
                    # (table "estimated_load_hourly_summary").
                    for label, row in df.iterrows():
                        print(label)
                        request_insert = "INSERT INTO " + columnFamilyName + "_summary " \
                            + " (datetime_est_load " \
                            + " ,date_est_load " \
                            + " ,annee " \
                            + " ,mois " \
                            + " ,semaine " \
                            + " ,heure " \
                            + " ,trimestre " \
                            + " ,jour_annee " \
                            + " ,jour_semaine " \
                            + " ,jour_mois " \
                            + " ,total_estimated_load " \
                            + " ) " \
                            + " VALUES ('" + str(label) + "' " \
                            + " ,'" + str(row['date']) + "' " \
                            + " ," + str(row['annee']) + " " \
                            + " ," + str(row['mois']) + " " \
                            + " ," + str(row['semaine']) + " " \
                            + " ," + str(row['heure']) + " " \
                            + " ," + str(row['trimestre']) + " " \
                            + " ," + str(row['jour_annee']) + " " \
                            + " ," + str(row['jour_semaine']) + " " \
                            + " ," + str(row['jour_mois']) + " " \
                            + " ," + str(row['estimated_load_hourly']) + " );"
                        print(request_insert)
                        session.execute(request_insert)

                    # Move the processed file into the "imported" folder.
                    client.rename(hdfs_file_path + "/" + pjm_file,
                                  hdfs_file_path + '/imported/' + pjm_file)

        except HdfsError as ex:
            # Handle HDFS errors.
            print(str(ex))

        # No more files to process.
        currentFileToImport = ""

        # Close the Cassandra connection.
        session.shutdown()

        # Wait before checking for new files.
        time.sleep(30)

    return ""
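
# Usage sketch (illustrative only): run the importer in a background thread
# and stop it by flipping the module-level flag.
import threading

worker = threading.Thread(target=start_service, daemon=True)
worker.start()
# ... later, request a clean exit after the current pass:
# stopRunning = True  (assign at module level, e.g. via a stop_service() helper)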
import datetime
import logging
import os
import posixpath as psp
import time

from hdfs import InsecureClient

# `HADOOPS`, `HdfsPathParser`, `permissions_octal_to_string`,
# `calculate_sha512_local` and `calculate_reader_hash` are project-local
# helpers; their imports are omitted here.

logger = logging.getLogger(__name__)


class WebHDFSStore():
    '''
    A file store based on the WebHDFS protocol.
    '''

    # Set a refresh-date to indicate when we did this lookup:
    refresh_date = datetime.datetime.utcnow().isoformat(
        timespec='milliseconds') + 'Z'

    def __init__(self, service_id, user_override=None):
        self.service_id = service_id
        self.webhdfs_url = HADOOPS[service_id]['webhdfs_url']
        self.webhdfs_user = HADOOPS[service_id]['webhdfs_user']
        if user_override:
            self.webhdfs_user = user_override
        self.id_prefix = HADOOPS[service_id]['id_prefix']
        self.client = InsecureClient(self.webhdfs_url, self.webhdfs_user)

    def put(self, local_path, hdfs_path, backup_and_replace=False):
        # Get the status of the destination:
        dest_status = self.client.status(hdfs_path, strict=False)

        # Handle files or directories (returning the outcome so move() can
        # check it):
        if os.path.isfile(local_path):
            hdfs_path = self._combine_paths(dest_status, local_path, hdfs_path)
            return self._upload_file(local_path, hdfs_path, backup_and_replace)
        elif os.path.isdir(local_path):
            # TODO, if it's a directory
            raise Exception(
                "Cannot upload anything other than single files at this time!")
        else:
            raise Exception("Unknown path type! Can't handle %s" % local_path)

    def _combine_paths(self, dest_status, local_path, hdfs_path):
        # If the hdfs_path is a directory, combine the paths. NOTE: this
        # assumes a relative local_path; posixpath.join discards hdfs_path if
        # local_path is absolute.
        if dest_status and dest_status['type'] == 'DIRECTORY':
            combined_path = psp.join(hdfs_path, local_path)
            logger.info("Using combined path: %s" % combined_path)
            return combined_path
        else:
            # Otherwise, just return the path:
            return hdfs_path

    def _upload_file(self, local_path, hdfs_path, backup_and_replace=False):
        """
        Copy up to HDFS, making it suitably atomic by using a temporary
        filename during upload.

        :return: True if the upload succeeded.
        """
        # Set up flag to record outcome:
        success = False

        # Calculate hash of local file:
        logger.info("Calculating hash of %s" % local_path)
        if not os.path.isfile(local_path):
            raise Exception("Cannot upload %s - individual files only!" %
                            local_path)
        local_hash = calculate_sha512_local(local_path)
        logger.info("Local %s hash is %s " % (local_path, local_hash))

        # TODO Allow upload to overwrite truncated files?

        # Check if the destination file exists:
        already_exists = self.exists(hdfs_path)
        if already_exists and not backup_and_replace:
            logger.warning(
                "Path %s already exists! No upload will be attempted." %
                hdfs_path)
        else:
            # Upload to a temporary path:
            tmp_path = "%s_temp_" % hdfs_path

            # Now upload the file, allowing overwrites as this is a temporary
            # file and simultaneous updates should not be possible:
            logger.info("Uploading as %s" % tmp_path)
            with open(local_path, 'rb') as reader, self.client.write(
                    tmp_path, overwrite=True) as writer:
                while True:
                    data = reader.read(10485760)
                    if not data:
                        break
                    writer.write(data)

            # If set, backup-and-replace as needed:
            if backup_and_replace and already_exists:
                date_stamp = datetime.datetime.now().strftime(
                    '%Y-%m-%d_%H-%M-%S')
                backup_path = "%s.bkp_%s" % (hdfs_path, date_stamp)
                logger.warning("Renaming %s to %s..." %
                               (hdfs_path, backup_path))
                self.client.rename(hdfs_path, backup_path)

            # Move the uploaded file into the right place:
            logger.info("Renaming %s to %s..." % (tmp_path, hdfs_path))
            self.client.rename(tmp_path, hdfs_path)

            # Give the namenode a moment to catch-up with itself and then
            # check it's there:
            # FIXME I suspect this is only needed for our ancient HDFS
            time.sleep(2)
            status = self.client.status(hdfs_path)

            logger.info("Calculating hash of HDFS file %s" % hdfs_path)
            hdfs_hash = self.calculate_sha512(hdfs_path)
            logger.info("HDFS %s hash is %s " % (hdfs_path, hdfs_hash))
            if local_hash != hdfs_hash:
                raise Exception("Local & HDFS hashes do not match for %s" %
                                local_path)
            else:
                logger.info("Hashes are equal!")
                success = True

            # Log successful upload:
            logger.warning("Upload completed for %s" % hdfs_path)

        # And return success flag so caller knows it worked:
        return success

    def move(self, local_path, hdfs_path):
        # Perform the PUT first:
        success = self.put(local_path, hdfs_path)
        # And delete the local file if that worked:
        if success == True:
            os.remove(local_path)

    def calculate_sha512(self, path):
        '''
        Calculate the SHA512 hash of a single file on HDFS
        '''
        with self.client.read(path) as reader:
            file_hash = calculate_reader_hash(reader, path)
        return file_hash

    def _to_info(self, path, status):
        # Add the file path:
        status['file_path'] = path
        # Classify based on HDFS storage conventions:
        item = HdfsPathParser(status).to_dict()
        # Work out the permissions string:
        if status['permission'].isnumeric():
            permissions = permissions_octal_to_string(
                int(status['permission']))
            if status['type'] == 'DIRECTORY':
                permissions = "d" + permissions
            else:
                permissions = "-" + permissions
        else:
            permissions = status['permission']
        # Defined fields based on directory/file status
        if permissions[0] == 'd':
            fs_type = 'directory'
            access_url = '%s/webhdfs/v1%s?op=LISTSTATUS&user.name=%s' % (
                self.webhdfs_url, item['file_path'], self.webhdfs_user)
        else:
            fs_type = 'file'
            access_url = '%s/webhdfs/v1%s?op=OPEN&user.name=%s' % (
                self.webhdfs_url, item['file_path'], self.webhdfs_user)
        # And return as a 'standard' dict:
        return {
            'id': '%s%s' % (self.id_prefix, item['file_path']),
            'refresh_date_dt': self.refresh_date,
            'file_path_s': item['file_path'],
            'file_size_l': item['file_size'],
            'file_ext_s': item['file_ext'],
            'file_name_s': item['file_name'],
            'permissions_s': permissions,
            'hdfs_replicas_i': item['number_of_replicas'],
            'hdfs_user_s': item['user_id'],
            'hdfs_group_s': item['group_id'],
            'modified_at_dt': "%sZ" % item['modified_at'],
            'timestamp_dt': "%sZ" % item['timestamp'],
            'year_i': item['timestamp'][0:4],
            'recognised_b': item['recognised'],
            'kind_s': item['kind'],
            'collection_s': item['collection'],
            'stream_s': item['stream'],
            'job_s': item['job'],
            'layout_s': item['layout'],
            'hdfs_service_id_s': self.service_id,
            'hdfs_type_s': fs_type,
            'access_url_s': access_url
        }

    def list(self, path, recursive=False):
        # Handle non-existent entry, or a file:
        path_status = self.client.status(path, strict=False)
        if path_status is None:
            raise Exception("No such file or directory: %s" % path)
        elif path_status['type'] == 'FILE':
            # Plain old file:
            yield self._to_info(path, path_status)
        else:
            # Handle folders:
            if recursive:
                for dir_info, dirs_info, files_info in self.client.walk(
                        path, status=True):
                    dir_path, dir_status = dir_info
                    for file_name, file_status in files_info:
                        file_path = psp.join(dir_path, file_name)
                        yield self._to_info(file_path, file_status)
            else:
                for file_name, file_status in self.client.list(path,
                                                               status=True):
                    file_path = psp.join(path, file_name)
                    yield self._to_info(file_path, file_status)

    def exists(self, path):
        status = self.client.status(path, strict=False)
        if status:
            return True
        else:
            return False

    def rm(self, path):
        # And delete from HDFS (usually prevented by API proxy)
        # Hard-coded to never act recursively - if you want that, do it
        # manually via the back-end.
        self.client.delete(path, recursive=False)

    def stream(self, path, offset=0, length=None):
        # NOTE our WebHDFS service is very old and uses 'len' not 'length' for
        # controlling the response length; the API proxy we use attempts to
        # remedy this by mapping any 'length' parameter to 'len'.
        return self.client.read(path, offset=offset, length=length)

    def read(self, path, offset=0, length=None):
        with self.stream(path, offset, length) as reader:
            while True:
                data = reader.read(10485760)
                if not data:
                    break
                yield data

    def lsr_to_items(self, reader):
        """
        This task processes a raw list of files generated by the
        hadoop fs -lsr command.

        As this can be a very large list, it avoids reading it all into
        memory. It parses each line, and yields a suitable stream of parsed
        objects matching the WebHDFS API.
        """
        for line in reader:
            if "lsr: DEPRECATED: Please use 'ls -R' instead." in line:
                logger.warning(line)
            else:
                (permissions, number_of_replicas, userid, groupid, filesize,
                 modification_date, modification_time,
                 filename) = line.split(None, 7)
                filename = filename.strip()
                timestamp = datetime.datetime.strptime(
                    '%s %s' % (modification_date, modification_time),
                    '%Y-%m-%d %H:%M')
                info = {
                    'permission': permissions,
                    'replication': number_of_replicas,
                    'owner': userid,
                    'group': groupid,
                    'length': filesize,
                    'modificationTime': timestamp.timestamp() * 1000,
                    'pathSuffix': filename
                }
                # Record the entry type before yielding (the original set it
                # after the yield, with the two cases swapped):
                if permissions[0] == 'd':
                    info['type'] = 'DIRECTORY'
                else:
                    info['type'] = 'FILE'
                # Skip directories:
                if permissions[0] != 'd':
                    yield self._to_info(filename, info)
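
# Usage sketch (hypothetical HADOOPS registry entry, matching the keys the
# constructor reads): upload a file, then list its metadata record. The
# relative local path keeps _combine_paths' join well-behaved.
HADOOPS = {
    'h3': {
        'webhdfs_url': 'http://namenode.example.com:9870',
        'webhdfs_user': 'hdfs',
        'id_prefix': 'hdfs://h3',
    }
}

store = WebHDFSStore('h3')
store.put('example.warc.gz', '/heritrix/output/')
for info in store.list('/heritrix/output/'):
    print(info['file_path_s'], info['file_size_l'])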