class Storage:

    def __init__(self, protocol: str = 'webHDFS', *args, **kwargs):
        self.protocol, self.client = protocol.lower(), None
        if protocol.lower() == 'webHDFS'.lower():
            from hdfs import InsecureClient
            self.client = InsecureClient(*args, **kwargs)
        for f in 'upload download list status delete'.split():
            setattr(self, f, getattr(self, '%s_%s' % (f, protocol.lower())))

    def upload_webhdfs(self, local_path: str, remote_path: str, **kwargs):
        to_screen("upload %s -> %s" % (local_path, remote_path))
        return self.client.upload(local_path=local_path, hdfs_path=remote_path, **kwargs)

    def download_webhdfs(self, remote_path: str, local_path: str, **kwargs):
        mkdir_for(local_path)
        to_screen("download %s -> %s" % (remote_path, local_path))
        return self.client.download(local_path=local_path, hdfs_path=remote_path, overwrite=True, **kwargs)

    def list_webhdfs(self, remote_path: str, **kwargs):
        return self.client.list(hdfs_path=remote_path, **kwargs)

    def status_webhdfs(self, remote_path: str, **kwargs):
        return self.client.status(hdfs_path=remote_path, **kwargs)

    def delete_webhdfs(self, remote_path: str, **kwargs):
        return self.client.delete(hdfs_path=remote_path, **kwargs)
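A minimal usage sketch of the wrapper above, assuming the surrounding module provides `to_screen` and `mkdir_for`; the namenode URL, user, and paths are placeholders:

# Hypothetical usage of the Storage wrapper; endpoint, user and paths are placeholders.
store = Storage('webHDFS', 'http://namenode:9870', user='hdfs')

# __init__ aliases upload/download/list/status/delete to their *_webhdfs variants,
# so the protocol suffix never appears at the call site.
store.upload('/tmp/report.csv', '/data/reports/report.csv')
print(store.status('/data/reports/report.csv'))   # WebHDFS FileStatus dict
print(store.list('/data/reports'))
store.download('/data/reports/report.csv', '/tmp/report_copy.csv')
store.delete('/data/reports/report.csv')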
def get_hdfs_max_date():
    SUCCESS_FILE = f"{PARQUET_FILE}/_SUCCESS"
    try:
        client = InsecureClient('http://namenode:9870', user='******')
        time_ts = client.status(SUCCESS_FILE)["modificationTime"] / 1000
        return date.fromtimestamp(time_ts)
    except Exception:
        log("Exception while trying to get parquet max date")
        log(traceback.format_exc())
        return DEFAULT_DATE
class interHDFS:

    def __init__(self, url, user=None, **kwargs):
        self.url = url
        self.user = user
        for k, v in kwargs.items():
            setattr(self, k, v)
        self.connect = InsecureClient(self.url, self.user)
        try:
            self.connect.status('/')
        except Exception as e:
            print(f"[ERROR]: {e}")
            raise ConnectionError("connection failed!") from e

    @property
    def apiVersion(self):
        return "v1"

    def listDir(self, dirname: str = '/'):
        return self.connect.list(dirname)

    def getFiles(self, dirname: str, depth: int = 0) -> list:
        l = []
        if not dirname:
            print("dirname is null")
        else:
            for file in self.connect.walk(dirname, depth=depth):
                if file[-1]:
                    for f in file[-1]:
                        l.append(file[0] + '/' + f)
        return l

    def downloadToCsv(self, filename: str) -> None:
        '''only split on the '€€' sign, and generate the same filename in the current directory'''
        with self.connect.read(filename, encoding='utf-8') as reader:
            with open(csvdir + filename.split('/')[-1].split('.')[0] + '.csv', 'a+') as cf:
                for line in reader.readlines():
                    newline = line.replace('€€', ',')
                    cf.write(newline)
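A short usage sketch of interHDFS; the namenode URL and file paths are placeholders, and `csvdir` is assumed to be a module-level output directory ending with a slash:

# Hypothetical usage of interHDFS; URL, user and paths are placeholders.
fs = interHDFS('http://namenode:9870', user='hdfs')
print(fs.apiVersion)                 # -> "v1"
print(fs.listDir('/data'))           # top-level listing
for path in fs.getFiles('/data/raw', depth=2):
    print(path)
fs.downloadToCsv('/data/raw/part-00000.txt')   # writes part-00000.csv under csvdir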
def get(self):
    # Fetch the dataset used for evaluation
    df = get_data_cassandra()
    print(df.head())
    X = df['total_estimated_load'].values
    # evaluate parameters (p,d,q) <=> (AR, I, MA)
    p_values = 7
    d_values = 0
    q_values = 5
    #best_cfg, best_score = evaluate_models(X, p_values, d_values, q_values)
    best_cfg = (p_values, d_values, q_values)
    # Train the best model
    model = ARIMA(X, order=best_cfg)
    model_fit = model.fit()
    # save model
    if not os.path.exists(model_local_path):
        # Create the local export directory if it does not exist yet
        os.makedirs(model_local_path, exist_ok=False)
    model_fit.save(model_local_path + model_name)
    # Connect to the HDFS client
    client = InsecureClient(url='http://namenode:9870', user='******')
    # Create the directory that stores the processed files
    if client.status(model_hdfs_remote_path, strict=False) is None:
        client.makedirs(model_hdfs_remote_path)
    # Copy the model to HDFS
    remote_load_path = client.upload(model_hdfs_remote_path, model_local_path + model_name, overwrite=True)
    #print(remote_load_path)
    print(client.list(model_hdfs_remote_path))
    return {'best_cfg': best_cfg, 'status': 'Terminated'}
def get(self, period):
    print("Period to predict : ", period)
    # Connect to the HDFS client
    client = InsecureClient(url='http://namenode:9870', user='******')
    # Check that the saved model is present on HDFS
    if client.status(model_hdfs_remote_path + model_name, strict=False) is not None:
        # load model
        client.download(model_hdfs_remote_path + model_name, model_local_path, overwrite=True)
        model_fit = ARIMAResults.load(model_local_path + model_name)
        # Dataset used for evaluation
        df = get_data_cassandra()
        print(df.head())
        X = df['total_estimated_load'].values
        start_index = len(X)
        end_index = start_index + int(period)
        forecast = model_fit.predict(start=start_index, end=end_index)
        #df['date_est_load'] = df['date_est_load'].apply(pd.Timestamp)
        day = df['date_est_load'].values[-1].date()
        print(day)
        print(type(day))
        day += datetime.timedelta(days=1)
        res = {}
        for yhat in forecast:
            res[day.strftime("%d/%m/%Y")] = yhat
            day += datetime.timedelta(days=1)
        return res
    return "Service has been stopped"
def on_data(self, data):
    try:
        if self.count <= 10000000:
            with open(self.outfile, 'a+') as f:
                f.write(data)
            self.count += len(data)
            return True
        else:
            hdfs_path = '/team40/stream_data/' + time.strftime('%Y-%m-%d_%H-%M', time.localtime()) + self.outfile
            client = InsecureClient('http://115.146.86.32:50070', user='******')
            client.upload(hdfs_path, self.outfile)
            print(client.status(hdfs_path, strict=False))
            self.count = 0
            with open(self.outfile, 'w') as f:
                f.write(data)
            self.count += len(data)
            return True
    except BaseException as e:
        print("Error on_data: %s" % str(e))
        return True
def generate_random_tensor_data_hdfs(all_tensors_config,
                                     cardinalities,
                                     tensor_name,
                                     zero_based_indices=False,
                                     hdfs_url='http://spark-master0-dsl05:50070',
                                     hdfs_user='******'):
    # generate tensor data on local file
    hdfs_filename = os.path.join(gctf_data_path_no_url, tensor_name + '.csv')
    print('generate_random_tensor_data_hdfs: generating %s' % hdfs_filename)
    client = InsecureClient(hdfs_url, user=hdfs_user)
    assert client.status(hdfs_filename, strict=False) is None, 'data file %s exists, cannot proceed' % hdfs_filename
    with client.write(hdfs_filename, encoding='utf-8') as writer:
        write_header(all_tensors_config, tensor_name, writer)
        iter_indices_gen_data(all_tensors_config, cardinalities, tensor_name, writer, zero_based_indices)
    # fd.close()  # TODO: hdfs client api does not specify close?
    all_tensors_config[tensor_name]['hdfs_filename'] = hdfs_filename
from datetime import date

# hdfs_path = '/projects/projectfinder/raw/items/' +\
#     date.today().year.__str__() + '/' +\
#     date.today().month.__str__() + '/'

#%%
hdfs_path = '/projects/projectfinder/raw/items/2019'

#%%
hdfs_client.download(hdfs_path, 'hdfs_data', n_threads=5)

#%%
hdfs_client_status = hdfs_client.status('/', strict=True)
hdfs_client_status

#%%
hdfs_file_status = hdfs_client.list(hdfs_path)
hdfs_file_status

#%% [markdown]
# Go to [manuel](https://hdfscli.readthedocs.io/en/latest/advanced.html#path-expansion)
# ```bash
# # install hdfs using pip
# pip install hdfs
# ```

#%%
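The cells above use `hdfs_client` without ever constructing it; a plausible initialisation cell, assuming a default WebHDFS endpoint (URL and user are placeholders), could be:

#%%
# hypothetical set-up for the hdfs_client used above; URL and user are placeholders
from hdfs import InsecureClient
hdfs_client = InsecureClient('http://namenode:50070', user='hdfs')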
class HDFSLibrary: """ Test library for working with HDFS """ WEB_HDFS_URL = "" client = "" def __init__(self, namenode="localhost", port="50070"): self.WEB_HDFS_URL = 'http://' + namenode + ':' + str(port) print namenode, ">>", port, ">>", self.WEB_HDFS_URL self.client = InsecureClient(self.WEB_HDFS_URL) def check_hdfs_file_exists(self, file_path, stop=False): if None == self.client.status(file_path, strict=False): if stop: print "ERROR: Error: File does not exist: ", file_path return "ERROR: Error: File does not exist: ", file_path # exit(172) return False return True def get_hdfs_file_content(self, file_path): self.check_hdfs_file_exists(file_path, stop=True) data = "" with self.client.read(file_path) as reader: for line in reader: data += line return data def search_string_in_hdfs_file(self, file_path, text1, text2="aqwszx", text3="xzswqa"): ret = self.check_hdfs_file_exists(file_path, stop=True) found = "" if ret else ret with self.client.read(file_path) as reader: for line in reader: if line.find(text1) == -1 and line.find( text2) == -1 and line.find(text3) == -1: continue found += line return found def hdfs_file_should_not_contain(self, file_path, text1, text2="aqwszx", text3="xzswqa"): self.check_hdfs_file_exists(file_path, stop=True) with self.client.read(file_path) as reader: for line in reader: if line.find(text1) != -1 or line.find( text2) != -1 or line.find(text3) != -1: return False return True ######################## # # BASIC FUNCTIONS: # # ######################## def get_hdfs_file_folder_content_summary(self, file_path): """ Retrieving a file or folder content summary. :return: returns a file or folder content summary. """ self.check_hdfs_file_exists(file_path, stop=True) return self.client.content(file_path) def get_hdfs_file_folder_status(self, file_path): """ Retrieving a file or folder status. :return: returns a file or folder status. """ self.check_hdfs_file_exists(file_path, stop=True) return self.client.status(file_path) def list_hdfs_directory(self, folder_path): """ Listing all files inside a directory. :return: returns a file list. """ self.check_hdfs_file_exists(folder_path, stop=True) return self.client.list(folder_path) def move_hdfs_file(self, old_path, new_path): """ Renaming ("moving") a file. :return: NA """ self.check_hdfs_file_exists(old_path, stop=True) self.client.rename(old_path, new_path) def delete_hdfs_file(self, file_path): """ Deleting a file or folder recursively. :return: returns `True` if the deletion was successful otherwise `False` """ self.check_hdfs_file_exists(file_path) return self.client.delete(file_path, recursive=True) def copy_to_local_hdfs_file(self, hdfs_path, local_path): """ Copy a file or folder from HDFS to local. :return: local_path """ self.check_hdfs_file_exists(hdfs_path) return self.client.download(hdfs_path, local_path, overwrite=True, n_threads=4) def copy_from_local_hdfs_file(self, local_path, hdfs_path): """ Copy a file or folder from local to HDFS. :return: hdfs_path """ return self.client.upload(hdfs_path, local_path, overwrite=True, n_threads=4) def get_hdfs_file_checksum(self, file_path): """ Get the checksum value for file :return: checksum """ self.check_hdfs_file_exists(file_path, stop=True) return self.client.checksum(file_path) def create_hdfs_dir(self, dir_path, perm=755): """ Create a directory or recursive dirs on HDFS :return: NA """ self.client.makedirs(dir_path, permission=perm)
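Outside of Robot Framework, the keyword library above can also be driven directly from Python; a minimal sketch, with namenode and paths as placeholders:

# Hypothetical direct use of HDFSLibrary; namenode, port and path are placeholders.
lib = HDFSLibrary(namenode="namenode", port="50070")
if lib.check_hdfs_file_exists("/data/input.txt"):
    print(lib.get_hdfs_file_content("/data/input.txt"))
    print(lib.get_hdfs_file_folder_status("/data/input.txt"))
    print(lib.list_hdfs_directory("/data"))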
from hdfs import InsecureClient
import bson
import time

MAX_SIZE_FILE = 1024 * 1024 * 128  # 128mb (in bytes)

# Init HDFS
client = InsecureClient('http://X:50070', user='******')
hdfs_dir = 'tweets/'
hdfs_file = 'tweets.json'
hdfs_files_list = client.list(hdfs_dir)
if len(hdfs_files_list) > 0:
    # Get last file
    hdfs_file = sorted(hdfs_files_list, reverse=True)[0]
    hdfs_file_num = int(hdfs_file.split('.')[0])
    hdfs_file_size = client.status(hdfs_dir + hdfs_file)['length']  # in bytes
else:
    # Create file
    hdfs_file_num = 1
    hdfs_file = str(hdfs_file_num) + '.json'
    hdfs_file_size = 0  # 0 bytes
    client.write(hdfs_dir + hdfs_file, '')

# Init kafka
consumer = KafkaConsumer('X', group_id='X', bootstrap_servers='X:9092')
print time.strftime("%Y-%m-%d %H:%M:%S") + ' [INFO] init KAFKA consumer and HDFS connection ok'

# New kafka message
for msg in consumer:
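The snippet is cut off inside the consumer loop. Purely as an illustration (not the original code), the size-based rollover it sets up might continue along these lines:

# Sketch of the rollover logic the truncated loop presumably implements; hypothetical, not the original.
for msg in consumer:
    payload = msg.value
    if hdfs_file_size + len(payload) > MAX_SIZE_FILE:
        # start a new file once the current one reaches ~128 MB
        hdfs_file_num += 1
        hdfs_file = str(hdfs_file_num) + '.json'
        hdfs_file_size = 0
        client.write(hdfs_dir + hdfs_file, '')
    client.write(hdfs_dir + hdfs_file, payload, append=True)
    hdfs_file_size += len(payload)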
class HDFSStorage(Storage): """ HDFS storage """ def fix_slashes(self, path): sep = os.path.sep if path[0] != sep: path = sep + path if path[-1] != sep: path = path + sep return path def __init__(self, location=None, base_url=None): self.hdfs_hosts = settings.HDFS_STORAGE['hosts'] self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root']) self.media_root = settings.MEDIA_ROOT self.media_url = self.fix_slashes(settings.MEDIA_URL) self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root) self.client = InsecureClient(self.hdfs_hosts) def _open(self, name, mode='rb'): local_path = os.path.join(settings.MEDIA_ROOT, name.replace('/', os.path.sep)) if not os.path.exists(local_path): remote_path = self.path(name) local_dir = os.path.dirname(local_path) if not os.path.exists(local_dir): os.makedirs(local_dir) print(self.client.download(remote_path, local_path=local_path, overwrite=True, temp_dir=tempfile.gettempdir())) return File(open(local_path, mode)) def _save(self, name, content): print("_save(%s, %s, %s)" % (self, name, content)) local_path = content.name hdfs_path = self.path(name) # os.path.basename(local_path)) print(hdfs_path, local_path) self.client.write(hdfs_path, data=content, overwrite=True) return name def url(self, name): return self.fetch_url % name def delete(self, name): return self.client.delete(self.path(name)) def listdir(self, path): file_list = [] dir_list = [] for name, status in self.client.list(self.path(path), status=True): if status['type'] == 'DIRECTORY': dir_list.append(name) elif status['type'] == 'FILE': file_list.append(name) return dir_list, file_list def size(self, name): return self.client.status(self.path(name))['length'] def exists(self, name): try: return True if self.client.status(self.path(name)) else False except HdfsError: return False def path(self, name): return (self.hdfs_root + name).replace('\\', '/')
class WebHDFSStore(): ''' A file store based on the WebHDFS protocol. ''' # Set a refresh-date to indicate when we did this lookup: refresh_date = datetime.datetime.utcnow().isoformat( timespec='milliseconds') + 'Z' def __init__(self, service_id, user_override=None): self.service_id = service_id self.webhdfs_url = HADOOPS[service_id]['webhdfs_url'] self.webhdfs_user = HADOOPS[service_id]['webhdfs_user'] if user_override: self.webhdfs_user = user_override self.id_prefix = HADOOPS[service_id]['id_prefix'] self.client = InsecureClient(self.webhdfs_url, self.webhdfs_user) def put(self, local_path, hdfs_path, backup_and_replace=False): # Get the status of the destination: dest_status = self.client.status(hdfs_path, strict=False) # Handle files or directories: if os.path.isfile(local_path): hdfs_path = self._combine_paths(dest_status, local_path, hdfs_path) self._upload_file(local_path, hdfs_path, backup_and_replace) elif os.path.isdir(local_path): # TODO, if it's a directory raise Exception( "Cannot upload anything other than single files at this time!") else: raise Exception("Unknown path type! Can't handle %s" % local_path) def _combine_paths(self, dest_status, local_path, hdfs_path): # If the hdfs_path is a directory, combine the paths: if dest_status and dest_status['type'] == 'DIRECTORY': combined_path = psp.join(hdfs_path, local_path) logger.info("Using combined path: %s" % combined_path) return combined_path else: # Otherwise, just return the path: return hdfs_path def _upload_file(self, local_path, hdfs_path, backup_and_replace=False): """ Copy up to HDFS, making it suitably atomic by using a temporary filename during upload. :return: None """ # Set up flag to record outcome: success = False # Calculate hash of local file: logger.info("Calculating hash of %s" % local_path) if not os.path.isfile(local_path): raise Exception("Cannot upload %s - individual files only!") local_hash = calculate_sha512_local(local_path) logger.info("Local %s hash is %s " % (local_path, local_hash)) # # TODO Allow upload to overwrite truncated files? # # Check if the destination file exists: already_exists = self.exists(hdfs_path) if already_exists and not backup_and_replace: logger.warning( "Path %s already exists! No upload will be attempted." % hdfs_path) else: # Upload to a temporary path: tmp_path = "%s_temp_" % hdfs_path # Now upload the file, allowing overwrites as this is a temporary file and # simultanous updates should not be possible: logger.info("Uploading as %s" % tmp_path) with open(local_path, 'rb') as reader, self.client.write( tmp_path, overwrite=True) as writer: while True: data = reader.read(10485760) if not data: break writer.write(data) # If set, backup-and-replace as needed: if backup_and_replace and already_exists: date_stamp = datetime.datetime.now().strftime( '%Y-%m-%d_%H-%M-%S') backup_path = "%s.bkp_%s" % (hdfs_path, date_stamp) logger.warning("Renaming %s to %s..." % (hdfs_path, backup_path)) self.client.rename(hdfs_path, backup_path) # Move the uploaded file into the right place: logger.info("Renaming %s to %s..." 
% (tmp_path, hdfs_path)) self.client.rename(tmp_path, hdfs_path) # Give the namenode a moment to catch-up with itself and then check it's there: # FIXME I suspect this is only needed for our ancient HDFS time.sleep(2) status = self.client.status(hdfs_path) logger.info("Calculating hash of HDFS file %s" % hdfs_path) hdfs_hash = self.calculate_sha512(hdfs_path) logger.info("HDFS %s hash is %s " % (hdfs_path, hdfs_hash)) if local_hash != hdfs_hash: raise Exception("Local & HDFS hashes do not match for %s" % local_path) else: logger.info("Hashes are equal!") success = True # Log successful upload: logger.warning("Upload completed for %s" % hdfs_path) # And return success flag so caller knows it worked: return success def move(self, local_path, hdfs_path): # Perform the PUT first: success = self.put(local_path, hdfs_path) # And delete the local file if that worked: if success == True: os.remove(local_path) def calculate_sha512(self, path): ''' Calculate the SHA512 hash of a single file on HDFS ''' with self.client.read(path) as reader: file_hash = calculate_reader_hash(reader, path) return file_hash def _to_info(self, path, status): # Add the file path: status['file_path'] = path # Classify based on HDFS storage conventions: item = HdfsPathParser(status).to_dict() # Work out the permissions string: if status['permission'].isnumeric(): permissions = permissions_octal_to_string(int( status['permission'])) if status['type'] == 'DIRECTORY': permissions = "d" + permissions else: permissions = "-" + permissions else: permissions = status['permission'] # Defined fields based on directory/file status if permissions[0] == 'd': fs_type = 'directory' access_url = '%s/webhdfs/v1%s?op=LISTSTATUS&user.name=%s' % ( self.webhdfs_url, item['file_path'], self.webhdfs_user) else: fs_type = 'file' access_url = '%s/webhdfs/v1%s?op=OPEN&user.name=%s' % ( self.webhdfs_url, item['file_path'], self.webhdfs_user) # And return as a 'standard' dict: return { 'id': '%s%s' % (self.id_prefix, item['file_path']), 'refresh_date_dt': self.refresh_date, 'file_path_s': item['file_path'], 'file_size_l': item['file_size'], 'file_ext_s': item['file_ext'], 'file_name_s': item['file_name'], 'permissions_s': permissions, 'hdfs_replicas_i': item['number_of_replicas'], 'hdfs_user_s': item['user_id'], 'hdfs_group_s': item['group_id'], 'modified_at_dt': "%sZ" % item['modified_at'], 'timestamp_dt': "%sZ" % item['timestamp'], 'year_i': item['timestamp'][0:4], 'recognised_b': item['recognised'], 'kind_s': item['kind'], 'collection_s': item['collection'], 'stream_s': item['stream'], 'job_s': item['job'], 'layout_s': item['layout'], 'hdfs_service_id_s': self.service_id, 'hdfs_type_s': fs_type, 'access_url_s': access_url } def list(self, path, recursive=False): # Handle non-existant entry, or a file: path_status = self.client.status(path, strict=False) if path_status is None: raise Exception("No such file or directory: %s" % path) elif path_status['type'] == 'FILE': # Plain old file: yield self._to_info(path, path_status) else: # Handle folders: if recursive: for dir_info, dirs_info, files_info in self.client.walk( path, status=True): dir_path, dir_status = dir_info for file_name, file_status in files_info: file_path = psp.join(dir_path, file_name) yield self._to_info(file_path, file_status) else: for file_name, file_status in self.client.list(path, status=True): file_path = psp.join(path, file_name) yield self._to_info(file_path, file_status) def exists(self, path): status = self.client.status(path, strict=False) if status: return True else: 
return False def rm(self, path): # And delete from HDFS (usually prevented by API proxy) # Hard-coded to never act recursively - if you want that, do it manually via the back-end. self.client.delete(path, recursive=False) def stream(self, path, offset=0, length=None): # NOTE our WebHDFS service is very old and uses 'len' not 'length' for controlling the response length: # The API proxy we use attempts to remedy this by mapping any 'length' parameter to 'len'. return self.client.read(path, offset=offset, length=length) def read(self, path, offset=0, length=None): with self.stream(path, offset, length) as reader: while True: data = reader.read(10485760) if not data: break yield data def lsr_to_items(self, reader): """ This task processes a raw list of files generated by the hadoop fs -lsr command. As this can be a very large list, it avoids reading it all into memory. It parses each line, and yields a suitable stream of parsed objects matching the WebHDFS API. """ for line in reader: if "lsr: DEPRECATED: Please use 'ls -R' instead." in line: logger.warning(line) else: permissions, number_of_replicas, userid, groupid, filesize, modification_date, modification_time, filename = line.split( None, 7) filename = filename.strip() timestamp = datetime.datetime.strptime( '%s %s' % (modification_date, modification_time), '%Y-%m-%d %H:%M') info = { 'permission': permissions, 'replication': number_of_replicas, 'owner': userid, 'group': groupid, 'length': filesize, 'modificationTime': timestamp.timestamp() * 1000, 'pathSuffix': filename } # Skip directories: if permissions[0] != 'd': yield self._to_info(filename, info) info['type'] = 'DIRECTORY' else: info['type'] = 'FILE'
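A brief usage sketch of the WebHDFSStore above; the service id, local file and HDFS paths are placeholders and assume a matching entry in the HADOOPS configuration dict:

# Hypothetical usage of WebHDFSStore; 'h3' and all paths are placeholders.
store = WebHDFSStore('h3')
store.put('warc-file.warc.gz', '/heritrix/output/')            # upload, keeping the local file name
for info in store.list('/heritrix/output/', recursive=True):   # yields Solr-style dicts per file
    print(info['file_path_s'], info['file_size_l'])
for chunk in store.read('/heritrix/output/warc-file.warc.gz'): # stream content in 10 MB chunks
    pass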
hive_template = 'hive://<hive-ip>:<hive-port>/{db}'.format(db=db)  # placeholder host/port
hive_path = '/user/hive/warehouse/{db}.db/{table}/ds={date}/{filename}'.format(
    db=db, table=table, filename=filename, date=date)
local_path = '/home/data/superhero/redis_stats/info_{date}'.format(date=date)
hdfs_path = '/tmp/{db}/{filename}'.format(**{'filename': filename, 'db': db})
try:
    hdfs_client = InsecureClient(hdfs_url)
    engine = sqlalchemy.create_engine(hive_template)
    conn_hive = engine.raw_connection()
    cur = conn_hive.cursor()
    # Check whether the file exists
    if hdfs_client.status(hive_path, strict=False):
        # Print the file size
        print hdfs_client.status(hive_path, strict=False).get('length', '0')
        print 'Data In Hive!'
    else:
        print 'Warning: Data Not In Hive!'.format(table)
    # Delete the Hive file
    try:
        hdfs_client.delete(hive_path)
        print '{hive_path} Delete Complete'.format(hive_path=hive_path)
    except Exception, e:
        print e
        print '{hive_path} Delete Failed'.format(hive_path=hive_path)
    # Upload the local file to Hive
    if os.path.exists(local_path):
class Uploader(): """ Initialise and set-up the HDFS connection: """ def __init__(self, hadoop_url, hadoop_user): # Set up client: self.hdfsClient = InsecureClient(hadoop_url, user=hadoop_user) def write_hash_file(self, path, hash, on_hdfs=False): if on_hdfs: raise Exception("Writing hash to HDFS not supported yet.") else: hash_path = "%s.sha512" % path if (os.path.exists(hash_path)): logger.warning("Hash file %s already exists." % hash_path) else: with open(hash_path, 'w') as hash_file: hash_file.write("%s\n" % hash) def safe_upload(self, localFile, hdfsFile, removeLocal=True): """ This performs a safe upload - it will never overwrite a file on HDFS, and it uses checksums to verify the transfer. :param localFile: :param hdfsFile: :return: """ # get local file hash and size localHash = get_checksum(localFile) self.write_hash_file(localFile, localHash) localSize = os.path.getsize(localFile) localModtime = datetime.fromtimestamp(os.path.getmtime(localFile)) # store checksum as a local file: # upload file to HDFS if not already existing hdfsFileStatus = self.hdfsClient.status(hdfsFile, strict=False) if hdfsFileStatus == None: logger.info('---- ----') logger.info("Copying %s to HDFS %s" % (localFile, hdfsFile)) logger.info("localFile size %i hash %s date %s" % (localSize, localHash, localModtime)) with open(localFile, 'r') as f: self.hdfsClient.write(data=f, hdfs_path=hdfsFile, overwrite=False) time.sleep(1) hdfsFileStatus = self.hdfsClient.status(hdfsFile, strict=False) # test if local and HDFS same if localSize != hdfsFileStatus['length']: logger.error( "hdfsFile %s size differs %i, %s size %i" % (hdfsFile, hdfsFileStatus['length'], localFile, localSize)) else: hdfsHash = get_checksum(hdfsFile, on_hdfs=True, hdfsClient=self.hdfsClient) if localHash != hdfsHash: logger.debug("hdfsFile %s hash differs %s, %s hash %s" % (hdfsFile, hdfsHash, localFile, localHash)) else: # if uploaded HDFS file hash same as local file hash, delete local file logger.info("hdfsFile size %i hash %s" % (hdfsFileStatus['length'], hdfsHash)) logger.info("Deleting %s" % localFile) os.remove(localFile) time.sleep(1)
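A minimal sketch of how the Uploader above might be driven; the namenode URL, user and paths are placeholders, and `get_checksum` is assumed to come from the original module:

# Hypothetical usage of Uploader; URL, user and paths are placeholders.
uploader = Uploader('http://namenode:50070', 'hdfs')
# writes batch-0001.warc.gz.sha512 locally, uploads only if the HDFS path is absent,
# verifies size and checksum, and removes the local copy on success
uploader.safe_upload('/data/exports/batch-0001.warc.gz',
                     '/ingest/batch-0001.warc.gz',
                     removeLocal=True)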
class HaHadoopConnector: def __init__(self, logger, hdfsHosts, user): self.logger = logger self._hdfsHosts = hdfsHosts self._user = user self._hdfsCli = None self._connHdfsInfo = None self._maxRetry = 10 self._lock = Lock() def close(self): self._hdfsCli = None self._connHdfsInfo = None def _printDebug(self, message): if self.logger: self.logger.debug(message) else: print message def _printError(self, errorMsg): if self.logger: self.logger.warn(errorMsg) else: print "[warn] %s" % (errorMsg) def _printException(self, exception): if self.logger: self.logger.exception(exception) else: print traceback.format_exc(exception) def _extractSafeTime(self, message): splitedErrorMsg = message.split(' ') safeTime = splitedErrorMsg[len(splitedErrorMsg) - 2] if safeTime.isdigit(): return True, int(safeTime) + 5 else: return False, None def _setConnection(self): if self._connHdfsInfo and self._hdfsCli: return self._hdfsCli self._lock.acquire() for hdfsHost in self._hdfsHosts: try: self._hdfsCli = InsecureClient(hdfsHost, user=self._user) self._hdfsCli.status('/') self._connHdfsInfo = hdfsHost debugMsg = "connected hdfs : %s" % hdfsHost if self.logger: self.logger.debug(debugMsg) break except HdfsError, e: self.close() errorMsg = "hdfs error : %s, %s" % (str(e), hdfsHost) self._printError(errorMsg) except ConnectionError, e: self.close() errorMsg = "connection error : %s, %s" % (str(e), hdfsHost) time.sleep(1) self._printError(errorMsg) except Exception, e: self.close() errorMsg = "connection error : %s" % (hdfsHost) self._printError(errorMsg) self._printException(e) if self._lock: self._lock.release() raise Exception
def my_dag_function():
    # host with a statically assigned IP
    hostname = '34.76.18.152'
    # ElasticSearch port
    elk_port = 9200
    # HDFS port
    hdfs_port = 50070
    # Open a connection to ElasticSearch and fetch data from the index (everything that arrived in the last 2 minutes).
    es = elasticsearch.Elasticsearch([hostname + ":" + str(elk_port)])
    res = es.search(index="dmitriy.voronko",
                    body={"query": {"range": {"@timestamp": {"gte": "now-2m", "lt": "now"}}}},
                    size=500)
    # Build the directory name for the current processing day.
    curr_dir_name = (datetime.now()).strftime("%Y%m%d")
    print("Directory for current date: " + curr_dir_name)
    # Open a connection to HDFS and check whether the file we are going to write to already exists in that directory.
    # If it does not, create it so that writing in append mode works without errors.
    client_hdfs = InsecureClient("http://" + hostname + ":" + str(hdfs_port), user="******")
    try:
        status = client_hdfs.status("/tmp/" + curr_dir_name + "/" + curr_dir_name + ".json")
    except:
        client_hdfs.write("/tmp/" + curr_dir_name + "/" + curr_dir_name + ".json", append=False, encoding="utf-8", data="")
    for doc in res['hits']['hits']:
        client_hdfs.write("/tmp/" + curr_dir_name + "/" + curr_dir_name + ".json", encoding="utf-8", append=True, data=json.dumps(doc['_source']))
    # Write the data to ClickHouse.
    # Possibly not the most optimal approach: each message from the ElasticSearch index is unpacked into variables,
    # which are then assembled into a tuple inserted into the ClickHouse table.
    client = Client('localhost', port=9011)
    for doc in res['hits']['hits']:
        data = json.loads(doc['_source']['message'])
        timestamp_v = data['timestamp']
        referer_v = data['referer']
        location_v = data['location']
        remoteHost_v = data['remoteHost']
        partyId_v = data['partyId']
        sessionId_v = data['sessionId']
        pageViewId_v = data['pageViewId']
        eventType_v = data['eventType']
        item_id_v = data['item_id']
        item_price_v = int(data['item_price'])
        item_url_v = data['item_url']
        basket_price_v = None
        if data['basket_price'] != '':
            basket_price_v = data['basket_price']
        detectedDuplicate_v = 0
        if data['detectedDuplicate'] == 'true':
            detectedDuplicate_v = 1
        else:
            detectedDuplicate_v = 0
        detectedCorruption_v = 0
        if data['detectedCorruption'] == 'true':
            detectedCorruption_v = 1
        else:
            detectedCorruption_v = 0
        firstInSession_v = 0
        if data['firstInSession'] == 'true':
            firstInSession_v = 1
        else:
            firstInSession_v = 0
        userAgentName_v = data['userAgentName']
        client.execute('INSERT INTO lab1db.lab1_messages (timestamp, referer, location, remoteHost, partyId, sessionId, pageViewId, eventType, item_id, item_price, item_url, basket_price, detectedDuplicate, detectedCorruption, firstInSession, userAgentName) VALUES',
                       [(timestamp_v, referer_v, location_v, remoteHost_v, partyId_v, sessionId_v, pageViewId_v, eventType_v, item_id_v, item_price_v, item_url_v, basket_price_v, detectedDuplicate_v, detectedCorruption_v, firstInSession_v, userAgentName_v)])
class HisiHdfs: def __init__(self): self._c = InsecureClient(url="http://{}:14000".format( HisiHdfs.get_host()), user='******', root="/") # self._c = InsecureClient(url="http://10.154.67.254:14000", user='******', root="/") @staticmethod def get_host(): domain = 'hdfs-ngx1.turing-ci.hisilicon.com' try: socket.gethostbyname(domain) return domain except Exception as e: return '10.154.67.254' @staticmethod def build_month_path(build_scene): '''daily build path''' return '/compilepackage/CI_Version/{}/br_hisi_trunk_ai/{}'.\ format(build_scene, datetime.datetime.today().strftime('%Y%m')) @staticmethod def prebuild_month_path(build_scene): '''compile path''' return '/compilepackage/CI_Version/{}/br_hisi_trunk_ai_PRE_COMPILE/{}'.\ format(build_scene, datetime.datetime.today().strftime('%Y%m')) def find_newest_build(self, build_scene): builds = self._c.list(HisiHdfs.build_month_path(build_scene), True) newest_build_name = None for build in builds: if type(build) != tuple: logging.warning("Unexpected build format {}".format(build)) continue if len(build) < 2: logging.warning("Unexpected build format {}".format(build)) continue if type(build[1]) != dict: logging.warning("Unexpected build format[1] {}".format(build)) continue if build[1].get('type', None) != "DIRECTORY": logging.warning( "Found unexpected build type(not DIRECTORY) {}".format( build)) continue if type(build[0]) != str: logging.warning("Unexpected build format[0] {}".format(build)) continue elements = build[0].split('_') if len(elements) != 3: logging.warning("Unexpected build name {}".format(build)) continue if elements[2] != "newest": continue # build_date = datetime.datetime.strptime('_'.join(elements[:2]), "%Y%m%d_%H%M%S%f") if newest_build_name is None: newest_build_name = build[0] continue if newest_build_name < build[0]: newest_build_name = build[0] return newest_build_name def path_exists(self, base_path: str, build_name: str): path = "{}/{}".format(base_path, build_name) return self._c.status(path, strict=False) is not None def find_package(self, base_path: str, build_name: str, package_type: PackageType, os_type=None, arch=None): if os_type is None: os_type, arch = get_env() path = "{}/{}".format(base_path, build_name) packages = self._c.list(path, True) pr = package_type.get_name_re() for package_name, package_info in packages: pm = pr.match(package_name) if pm is not None: if OsType.analyse_os(pm.group('os')) == os_type and pm.group( 'arch') == arch: return package_name return None def download_package(self, base_path: str, build_name: str, package_name: str, local_path: str): return self._c.download(hdfs_path="{}/{}/{}".format( base_path, build_name, package_name), local_path=local_path, overwrite=True) def download_compile_package(self, build_scene: str, build_name: str, package_name: str, local_path: str): return self.download_package(HisiHdfs.prebuild_month_path(build_scene), build_name, package_name, local_path) def download_daily_package(self, build_scene: str, build_name: str, package_name: str, local_path: str): return self.download_package(HisiHdfs.build_month_path(build_scene), build_name, package_name, local_path) def download_newest(self, local_path: str, packages: List[PackageType], os_type=None, arch=None): if not os.path.isdir(local_path): raise FileNotFoundError( "The path {} does not exists".format(local_path)) if os_type is None: os_type, arch = get_env() build_scenes_to_build_name = {} package_names = [] print("Begin to download newest run packages from the newest") for package in packages: build_scene = 
package.get_build_scene() newest_build_name = build_scenes_to_build_name.get( build_scene, self.find_newest_build(build_scene)) if newest_build_name is None: logging.error("Can not find the newest build") raise Exception("Can not find the newest build") package_name = self.find_package( HisiHdfs.build_month_path(build_scene), newest_build_name, package, os_type, arch) if package_name is None: logging.error( "Can not find the package {}, os {}, arch {}".format( package, os_type, arch)) raise Exception("Can not find package") with shell_printer.DotPrinter( "Begin to download {} from {} to {}".format( package_name, newest_build_name, local_path)): self.download_daily_package(build_scene, newest_build_name, package_name, local_path) logging.info("Download {} to {} successfully".format( package_name, local_path)) package_names.append(package_name) return package_names def download_compile_packages(self, build_name: str, local_path: str, package_types: List[PackageType]): self.wait_compile_paths_ready(package_types, build_name) package_names = [] for package_type in package_types: package_name = self.find_package( HisiHdfs.prebuild_month_path(package_type.get_build_scene()), build_name, package_type) if package_name is None: with shell_printer.DotPrinter("Wait package {} from {}".format( package_type.name, build_name)): while package_name is None: logging.debug( "Can not find package {} from {}, sleep".format( package_type.name, build_name)) time.sleep(10) package_name = self.find_package( HisiHdfs.prebuild_month_path( package_type.get_build_scene()), build_name, package_type) # 实测来看,刚创建好的文件直接下载可能有问题(下载失败,或者下载文件不完整),这里等5秒钟再下载 time.sleep(5) with shell_printer.DotPrinter("Begin to download {} to {}".format( package_name, local_path)): self.download_compile_package(package_type.get_build_scene(), build_name, package_name, local_path) logging.info("Download {} to {} successfully".format( package_name, local_path)) package_names.append(package_name) return package_names def wait_compile_paths_ready(self, package_types: List[PackageType], build_name: str): scenes = set([pt.get_build_scene() for pt in package_types]) for build_scene in scenes: build_path = HisiHdfs.prebuild_month_path(build_scene) if not self.path_exists(build_path, build_name): with shell_printer.DotPrinter( "The build({}) path({}) has not been created, wait". format(build_name, build_path)): while not self.path_exists(build_path, build_name): time.sleep(1)
class HdfsFile: '''HDFS File Object Keyword arguments: path -- HDFS file path mode -- one of ['r', 'rb', 'w', 'wb', 'a', 'ab'], all else will be seen as 'w' (default 'r') encoding -- should be specified if not in binary mode (default 'utf-8') ''' def __init__(self, path: str, mode: str = 'r', encoding: str = 'utf-8', host: str = HDFS_HOST, port: int = HDFS_PORT, user: str = HDFS_USER): self.client = InsecureClient(url=f'http://{host}:{port}', user=user) self.path = path self.name = path.split('\\')[-1] self.mode = mode self.encoding = encoding if self.mode[0] == 'r': self.__cache_content() self.fptr = 0 elif self.mode[0] == 'w': self.content = self.__binary_helper('') self.fptr = 0 elif self.mode[0] == 'a': self.__cache_content() self.fptr = len(self.content) else: raise UnsupportedMode(f'unsupported mode {self.mode}') def __binary_helper(self, content): if len(self.mode) > 1 and self.mode[1] == 'b': if isinstance(content, str): return content.encode(self.encoding) else: if isinstance(content, bytes): return content.decode(self.encoding) return content def __cache_content(self): if not self.exists(): raise FileNotFound() with self.client.read(self.path) as reader: self.content = self.__binary_helper(reader.read()) # iterable compatible def __iter__(self): return self # iterator compatible def __next__(self): buffer = self.readline() if buffer == self.__binary_helper(''): raise StopIteration() return buffer # for with ... as ... use def __enter__(self): return self # for with ... as ... use def __exit__(self, type, value, traceback): self.flush() def exists(self) -> bool: if self.client.status(hdfs_path=self.path, strict=False) is None: return False return True def read(self, size: int = None) -> str or bytes: if self.mode[0] != 'r': raise UnsupportedOperation(f'{self.mode} does not support read') if size is None or size < 0: offset = len(self.content) - self.fptr else: offset = size buffer = self.content[self.fptr:self.fptr + offset] self.fptr += offset return buffer def readline(self, size: int = None) -> str or bytes: if self.mode[0] != 'r': raise UnsupportedOperation(f'{self.mode} does not support read') offset = 0 while self.fptr + offset < len( self.content) and self.content[self.fptr + offset] not in [10, '\n']: offset += 1 offset += 1 buffer = self.content[self.fptr:self.fptr + offset] self.fptr += offset return buffer def seek(self, cookie: int): if not isinstance(cookie, int) or cookie < 0: raise InvalidParameterValue( f'cookie must be a non-negative integer') self.fptr = cookie def write(self, text: str or bytes) -> int: if self.mode[0] not in ['w', 'a']: raise UnsupportedOperation(f'{self.mode} does not support write') self.content += self.__binary_helper(text) return len(text) def flush(self): if self.mode[0] in ['w', 'a']: if not self.exists(): self.client.write(hdfs_path=self.path, data=self.content) else: self.client.write(hdfs_path=self.path, data=self.content, overwrite=True) def close(self): self.flush()
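A short usage sketch of the file-like wrapper above; host, port and user default from HDFS_HOST/HDFS_PORT/HDFS_USER in the original module, and the path is a placeholder:

# Hypothetical usage of HdfsFile; the path is a placeholder.
with HdfsFile('/data/notes.txt', mode='w', encoding='utf-8') as f:
    f.write('hello\n')
    f.write('world\n')          # flushed to HDFS on __exit__

with HdfsFile('/data/notes.txt', mode='r') as f:
    for line in f:              # __iter__/__next__ delegate to readline()
        print(line.rstrip())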
class HadoopFileSystem(object): def __init__(self, *opts): self.client = InsecureClient(current_app.config['WEBHDFS_ADDR'], user=current_app.config['WEBHDFS_USER']) # def make_tree(self, datasourceid, client, path): # tree = dict(name=(os.path.basename(path), datasourceid + os.path.sep + path), children=[]) # try: lst = client.list(path, status=True) # except: # pass #ignore errors # else: # for fsitem in lst: # fn = os.path.join(path, fsitem[0]) # if fsitem[1]['type'] == "DIRECTORY": # tree['children'].append(make_hdfs_tree(datasourceid, client, fn)) # else: # tree['children'].append({'name' : (fsitem[0], datasourceid + os.path.sep + fn), 'children' : []}) # return tree def make_json(self, datasourceid, base, relative_path): path = os.path.join(base, relative_path) data_json = {'datasource': datasourceid, 'path': relative_path, 'name': os.path.basename(relative_path) } status = self.client.status(path, False) if status is not None: if status['type'] == "DIRECTORY": data_json['type'] = DataType.Folder data_json['children'] = [self.make_json(datasourceid, base, os.path.join(relative_path, fn)) for fn in self.client.list(path)] else: data_json['type'] = DataType.File #print(json.dumps(data_json)) return data_json def makedirs(self, path): try: self.client.makedirs(path) except: return None return path def delete(self, path): try: if self.client.status(path, False) is not None: self.client.delete(path, True) except Exception as e: print(e) def addfolder(self, path): i = 0 while self.client.status(os.path.join(path, "New Folder ({0})".format(i)), False) is None: i += 1 return self.makedirs(os.path.join(path, "New Folder ({0})".format(i))) def rename(self, oldpath, newpath): try: self.client.rename(oldpath, newpath) except Exception as e: print(e) def saveUpload(self, file, fullpath): localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath)) if os.path.isfile(localpath): os.remove(localpath) try: file.save(localpath) self.client.upload(os.path.dirname(fullpath), localpath, True) except: pass def download(self, fullpath): status = self.client.status(fullpath, False) if status is not None and status['type'] == "FILE": localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath)) return self.client.download(fullpath, localpath, True) else: return None
class HDFSStorage(Storage): """ HDFS storage """ def fix_slashes(self, path): sep = os.path.sep if path[0] != sep: path = sep + path if path[-1] != sep: path = path + sep return path def __init__(self, location=None, base_url=None): self.hdfs_hosts = settings.HDFS_STORAGE['hosts'] self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root']) self.media_root = settings.MEDIA_ROOT self.media_url = self.fix_slashes(settings.MEDIA_URL) self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root) self.client = InsecureClient(self.hdfs_hosts) def _open(self, name, mode='rb'): local_path = os.path.join(settings.MEDIA_ROOT, name.replace('/', os.path.sep)) if not os.path.exists(local_path): remote_path = self.path(name) local_dir = os.path.dirname(local_path) if not os.path.exists(local_dir): os.mkdir(local_dir) print self.client.download(remote_path, local_path=local_path, overwrite=True, temp_dir=tempfile.gettempdir()) return File(open(local_path, mode)) def _save(self, name, content): print "_save(%s, %s, %s)" % (self, name, content) local_path = content.name hdfs_path = self.path(name) # os.path.basename(local_path)) print hdfs_path, local_path self.client.write(hdfs_path, data=content, overwrite=True) return name def url(self, name): return self.fetch_url % name def delete(self, name): return self.client.delete(self.path(name)) def listdir(self, path): file_list = [] dir_list = [] for name, status in self.client.list(self.path(path), status=True): if status['type'] == 'DIRECTORY': dir_list.append(name) elif status['type'] == 'FILE': file_list.append(name) return dir_list, file_list def size(self, name): return self.client.status(self.path(name))['length'] def exists(self, name): try: return True if self.client.status(self.path(name)) else False except HdfsError: return False def path(self, name): return (self.hdfs_root + name).replace('\\', '/')
class HadoopFileSystem(): def __init__(self, url, user): u = urlsplit(url) if u.scheme != 'http' and u.scheme != 'https': raise ValueError("Invalid name node address") self.url = urlunparse((u.scheme, u.netloc, '', '', '', '')) self.client = InsecureClient(self.url, user=user) self.localdir = u.path self.prefix = 'HDFS' def normalize_path(self, path): path = os.path.normpath(path) path = self.strip_prefix(path) while path and path[0] == os.sep: path = path[1:] return os.path.join(self.localdir, path) def strip_prefix(self, path): return path[len(self.prefix):] if path.startswith( self.prefix) else path def strip_root(self, path): path = self.strip_prefix(path) if path.startswith(self.url): path = path[len(self.url):] if not path.startswith(self.localdir): raise 'Invalid hdfs path. It must start with the root directory' return path[len(self.localdir):] if path.startswith( self.localdir) else path def create_folder(self, path): try: path = self.normalize_path(path) self.client.makedirs(path) except: return None return path def remove(self, path): try: path = self.normalize_path(path) if self.client.status(path, False) is not None: self.client.delete(path, True) except Exception as e: print(e) def rename(self, oldpath, newpath): try: oldpath = self.normalize_path(oldpath) newpath = self.normalize_path(newpath) self.client.rename(oldpath, newpath) except Exception as e: print(e) def get_files(self, path): path = self.normalize_path(path) files = [] for f in self.client.list(path): status = self.client.status(join(path, f), False) if status['type'] != "DIRECTORY": files.append(f) return files def get_folders(self, path): path = self.normalize_path(path) folders = [] for f in self.client.list(path): status = self.client.status(join(path, f), False) if status['type'] == "DIRECTORY": folders.append(f) return folders def exists(self, path): path = self.normalize_path(path) status = self.client.status(path, False) return not (status is None) def isdir(self, path): path = self.normalize_path(path) status = self.client.status(path, False) return status['type'] == "DIRECTORY" def isfile(self, path): path = self.normalize_path(path) status = self.client.status(path, False) return status['type'] == "FILE" def read(self, path): path = self.normalize_path(path) with self.client.read(path) as reader: return reader.read().decode('utf-8') def write(self, path, content): path = self.normalize_path(path) self.client.write(path, content) def make_json(self, path): normalized_path = self.normalize_path(path) data_json = { 'path': urljoin(self.url, normalized_path), 'text': os.path.basename(path) } status = self.client.status(normalized_path, False) if status is not None: data_json['folder'] = status['type'] == "DIRECTORY" if status['type'] == "DIRECTORY": data_json['nodes'] = [ self.make_json(os.path.join(path, fn)) for fn in self.client.list(normalized_path) ] #print(json.dumps(data_json)) return data_json def save_upload(self, file, fullpath): localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath)) if os.path.isfile(localpath): os.remove(localpath) try: file.save(localpath) if isfile(fullpath): fullpath = os.path.dirname(fullpath) self.client.upload(self.normalize_path(fullpath), localpath, True) except: pass def download(self, path): path = self.normalize_path(path) status = self.client.status(path, False) if status is not None and status['type'] == "FILE": localpath = os.path.join(tempfile.gettempdir(), os.path.basename(path)) return self.client.download(path, localpath, True) else: return 
None
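A brief usage sketch of the HadoopFileSystem wrapper above; the URL (including the root path component) and user are placeholders:

# Hypothetical usage of HadoopFileSystem; URL and user are placeholders.
fs = HadoopFileSystem('http://namenode:50070/user/demo', 'demo')
fs.create_folder('reports')                 # resolved against the /user/demo root
fs.write('reports/summary.txt', 'ok')
print(fs.exists('reports/summary.txt'))     # True
print(fs.get_files('reports'))              # ['summary.txt']
print(fs.read('reports/summary.txt'))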
def start_hdfs_streaming(): while True: #---------------------- connection to MongoDB client ---------------------------- client = MongoClient() db = client['DB_filtereddata'] collection_edits = db['Edit_filtered_collection'] collection_others = db['other_filtered_collection'] #-------------------- loading data into pandas dataframe ------------------------ col_edits = collection_edits.find() df_edits = pd.DataFrame(col_edits) col_others = collection_others.find() df_others = pd.DataFrame(col_others) df = df_edits.append(df_others, ignore_index=True) if df.empty is False: df = df.drop_duplicates() print(df.count()) print(df.head()) #---------------------- connection to HDFS localhost ---------------------------- client_hdfs = InsecureClient('http://localhost:50070/', user="******") print(client_hdfs) filename = 'Filtereddata.csv' #------------------- writing data into HDFS from dataframe ---------------------- exist = client_hdfs.status(filename, strict=False) print(f'{filename} existing in: {exist}') try: if exist == None: with client_hdfs.write(filename, encoding='utf-8', overwrite=True) as writer: print(writer) df.to_csv(writer) print(f'Data saved in {filename} in {client_hdfs}') else: with client_hdfs.write(filename, encoding='utf-8', append=True) as writer: print(writer) df.to_csv(writer) print( f'Data appended to existing file {filename} in {client_hdfs}' ) except ValueError: pass #------------------- creating backup file into local storage ---------------------- records = df_edits.to_dict(orient='records') now = datetime.now() current_time = now.strftime("%H_%M") jsonpath = collection_edits.name + '_' + current_time + ".json" jsonpath = join("C:/Backup_Mongo/filtereddata", jsonpath) with open(jsonpath, 'w') as jsonfile: jsonfile.write(dumps(records)) print(f'Backup stored in {jsonpath}') records = df_others.to_dict(orient='records') jsonpath = collection_others.name + '_' + current_time + ".json" jsonpath = join("C:/Backup_Mongo/filtereddata", jsonpath) with open(jsonpath, 'w') as jsonfile: jsonfile.write(dumps(records)) print(f'Backup stored in {jsonpath}') #---------------- deleting HDFS stored records from the MongoDB ------------------ records = df_edits.to_dict(orient='records') record_ids = [record['_id'] for record in records] collection_edits.delete_many({'_id': {'$in': record_ids}}) print('stored edit records in HDFS and deleted from MongoDB') records = df_others.to_dict(orient='records') record_ids = [record['_id'] for record in records] collection_others.delete_many({'_id': {'$in': record_ids}}) print('stored other records in HDFS and deleted from MongoDB') else: print(f"no records found in the {collection_edits}") time.sleep(60)
class HDFS(BaseRepository): def __init__(self, host: str, port, user: str): super().__init__() self.host = host self.port = port self.user = user self.prodcuer = None def connect(self): self.conn = InsecureClient(f"http://{self.host}:{self.port}", user=self.user) if os.environ.get("KAFKA_BOOTSTRAP", None): self.producer = KafkaProducer(bootstrap_servers=os.environ.get( "KAFAKA_BOOTSTRAP", "localhost:1234")) else: self.producer = None def disconnect(self): self.save_snapshot() if self.prodcuer: self.producer.close() def insert_rows(self, rows: list[(datetime, str, str, str, str, str)]): self.add_buff(rows) self.flush() def _last_datetime(self, category, date): if self.conn.status(f"/krwordcloud/add-article/{date}")['length'] == 0: return config.min_date tfname = '' with tempfile.NamedTemporaryFile("wb") as tf: tfname = tf.name with self.conn.read(f"/krwordcloud/add-article/{date}", chunk_size=8096) as hf: for chunk in hf: tf.write(chunk) with open(tfname, 'rb') as tf: reader = pyorc.Reader(tf) maximum = datetime.datetime \ .strptime(f"{date} GMT+0900", "%Y-%m-%d.orc GMT%z") for row in reader: if row[0] > maximum and row[1] == category: maximum = row[0] if (maximum < config.min_date): return config.min_date elif maximum > datetime.datetime.now().replace(tzinfo=KST): return datetime.datetime.now().replace(tzinfo=KST) else: return maximum os.unlink(tfname) def make_entries(self): entries = dict() hdfs_entries = dict() lookup_hdfs = [] self.load_snapshot() for category in config.categories: category_rows = list( filter(lambda row: row[1] == category, self.buff)) if len(category_rows) > 0: last = max(category_rows, key=lambda row: row[0]) entries[category] = last[0] else: lookup_hdfs.append(category) try: dates = self.conn.list("/krwordcloud/add-article/") if len(dates) > 0: for category in lookup_hdfs: found = False for last in reversed(dates): try: entries[category] = self._last_datetime( category, last) found = True break except Exception as e: print(e) continue if found is False: entries[category] = config.min_date else: hdfs_entries = dict.fromkeys(lookup_hdfs, config.min_date) except HdfsError: entries[category] = config.min_date except Exception as e: print(e) return { k: v for k, v in sorted({ **entries, **hdfs_entries }.items(), key=lambda item: item[1]) } def save_snapshot(self): print('save_snapshot') with self.conn.write("/krwordcloud/snapshot.json", overwrite=True, encoding="utf-8") as f: data = list( map(lambda x: (x[0].isoformat(), x[1], x[2], x[3], x[4], x[5]), self.buff)) json.dump(data, f, ensure_ascii=False) def load_snapshot(self): print('load_snapshot') try: with self.conn.read("/krwordcloud/snapshot.json", encoding="utf-8") as f: self.buff = list( map( lambda x: (parser.parse(x[0]), x[1], x[2], x[3], x[4], x[5]), json.load(f))) except Exception: self.buff = [] def flush(self): dates = sorted(list(set(map(lambda row: row[0].date(), self.buff)))) if len(dates) > 1: for d in dates[:-1]: data = list(filter(lambda row: row[0].date() == d, self.buff)) if self.producer: self._kafka_flush(d, data) else: self._hdfs_flush(d, data) self.buff = list( filter(lambda row: row[0].date() == dates[-1], self.buff)) self.save_snapshot() def _kafka_flush(self, date, data): self.producer.send(f"add-article-{date}", data) def _hdfs_flush(self, date, data): with self.conn.write(f"/krwordcloud/add-article/{date}.orc", overwrite=True) as hf: tfname = '' with tempfile.NamedTemporaryFile(mode="wb+", delete=False) as tf: tfname = tf.name with pyorc.Writer( tf, 
schema="struct<field0:timestamp,field1:string," + "field2:string,field3:string>", ) as of: of.writerows(data) with open(tfname, 'rb') as tf: for line in tf: hf.write(line) os.unlink(tfname)
class HDFSWrapper(object): def __init__(self): self.__m_HDFS_Handler__ = None self.__m_HDFS_WebFSDir__ = None self.__m_HDFS_User__ = None self.__m_HDFS_WebFSURL__ = None def HDFS_makedirs(self, hdfs_path): """ 创建目录 """ if self.__m_HDFS_Handler__ is None: raise HDFSWrapperException( "HDFS not connected. Please connect it frist.") self.__m_HDFS_Handler__.makedirs( os.path.join(self.__m_HDFS_WebFSDir__, hdfs_path).replace('\\', '/')) def HDFS_setPermission(self, hdfs_path, permission): """ 修改指定文件的权限信息 """ if self.__m_HDFS_Handler__ is None: raise HDFSWrapperException( "HDFS not connected. Please connect it frist.") m_hdfs_filepath = os.path.dirname(hdfs_path) m_hdfs_filename = os.path.basename(hdfs_path) self.__m_HDFS_Handler__.set_permission(os.path.join( self.__m_HDFS_WebFSDir__, m_hdfs_filepath, m_hdfs_filename).replace('\\', '/'), permission=permission) def HDFS_Connect(self, p_szURL, p_szUser): """ 连接HDFS, URL使用WEBFS协议 """ m_HDFS_Protocal = p_szURL.split("://")[0] m_HDFS_NodePort = p_szURL[len(m_HDFS_Protocal) + 3:].split("/")[0] m_HDFS_WebFSURL = m_HDFS_Protocal + "://" + m_HDFS_NodePort self.__m_HDFS_User__ = p_szUser self.__m_HDFS_WebFSURL__ = m_HDFS_WebFSURL self.__m_HDFS_WebFSDir__ = p_szURL[len(m_HDFS_WebFSURL):] self.__m_HDFS_Handler__ = InsecureClient(url=m_HDFS_WebFSURL, user=p_szUser, root=self.__m_HDFS_WebFSDir__) # 尝试创建目录,如果目录不存在的话 self.__m_HDFS_Handler__.makedirs( self.__m_HDFS_WebFSDir__.replace('\\', '/')) def HDFS_CD(self, p_szPath): self.__m_HDFS_WebFSDir__ = os.path.join(self.__m_HDFS_WebFSDir__, p_szPath) self.__m_HDFS_Handler__ = InsecureClient(url=self.__m_HDFS_WebFSURL__, user=self.__m_HDFS_User__, root=self.__m_HDFS_WebFSDir__) # 尝试创建目录,如果目录不存在的话 self.__m_HDFS_Handler__.makedirs( self.__m_HDFS_WebFSDir__.replace('\\', '/')) def HDFS_status(self, hdfs_path=""): """ 返回目录下的文件 """ if self.__m_HDFS_Handler__ is None: raise HDFSWrapperException( "HDFS not connected. Please connect it frist.") m_ReturnList = [] m_Status = self.__m_HDFS_Handler__.status(hdfs_path) m_ReturnList.append((hdfs_path, m_Status)) return m_ReturnList def HDFS_list(self, hdfs_path="", recusive=False): """ 返回目录下的文件 """ if self.__m_HDFS_Handler__ is None: raise HDFSWrapperException( "HDFS not connected. Please connect it frist.") m_ReturnList = [] if not recusive: for row in self.__m_HDFS_Handler__.list(hdfs_path, status=True): m_ReturnList.append((os.path.join(hdfs_path, row[0]), row[1])) return m_ReturnList else: for row in self.__m_HDFS_Handler__.list(hdfs_path, status=True): if row[1]['type'].upper() == 'DIRECTORY': m_ReturnList.append( (os.path.join(hdfs_path, row[0]).replace("\\", "/"), row[1])) m_ReturnList.extend( self.HDFS_list(os.path.join(hdfs_path, row[0]).replace("\\", "/"), recusive=True)) else: m_ReturnList.append( (os.path.join(hdfs_path, row[0]).replace("\\", "/"), row[1])) return m_ReturnList def HDFS_Download(self, hdfs_path="", local_path="", recusive=False): """ 从hdfs获取文件到本地 """ if self.__m_HDFS_Handler__ is None: raise HDFSWrapperException( "HDFS not connected. Please connect it frist.") # 如果本地没有对应目录,且local_path传递的是一个目录,则建立目录 m_LocalPath = local_path if m_LocalPath.endswith("/") and not os.path.exists(m_LocalPath): os.makedirs(m_LocalPath) m_FileList = self.HDFS_list(recusive=recusive) for row in m_FileList: if fnmatch.fnmatch(row[0], hdfs_path): self.__m_HDFS_Handler__.download(row[0], m_LocalPath, overwrite=True) def HDFS_Upload(self, local_path, hdfs_path=""): """ 上传文件到hdfs """ if self.__m_HDFS_Handler__ is None: raise HDFSWrapperException( "HDFS not connected. 
Please connect it frist.") for file in glob(local_path): if hdfs_path == "": m_hdfs_filepath = "" m_hdfs_filename = os.path.basename(file) else: if hdfs_path.endswith("/"): m_hdfs_filepath = hdfs_path m_hdfs_filename = os.path.basename(file) else: m_hdfs_filepath = os.path.dirname(hdfs_path) m_hdfs_filename = os.path.basename(hdfs_path) try: remote_status = self.__m_HDFS_Handler__.status( hdfs_path=os.path.join(self.__m_HDFS_WebFSDir__, m_hdfs_filepath).replace('\\', '/'), strict=True) if remote_status['type'] == "FILE": # 远程以为是目录的地方其实放了一个奇怪的文件,于是删掉它 self.__m_HDFS_Handler__.delete(os.path.join( self.__m_HDFS_WebFSDir__, m_hdfs_filepath).replace('\\', '/'), recursive=True) remote_status = self.__m_HDFS_Handler__.status( os.path.join(self.__m_HDFS_WebFSDir__, m_hdfs_filepath, m_hdfs_filename).replace('\\', '/')) if remote_status['type'] == "DIRECTORY": # 远程目录已经存在, 会尝试删除这个目录 self.__m_HDFS_Handler__.delete(os.path.join( self.__m_HDFS_WebFSDir__, m_hdfs_filepath, m_hdfs_filename).replace('\\', '/'), recursive=True) except HdfsError: # 远程目录不存在,后续的upload会建立该目录 pass self.__m_HDFS_Handler__.upload(os.path.join( self.__m_HDFS_WebFSDir__, m_hdfs_filepath, m_hdfs_filename).replace('\\', '/'), file, overwrite=True, cleanup=True) def Process_SQLCommand(self, p_szSQL): try: m_szSQL = p_szSQL.strip() matchObj = re.match(r"hdfs\s+connect\s+(.*)\s+with\s+user\s+(.*)$", m_szSQL, re.IGNORECASE | re.DOTALL) if matchObj: m_HDFSServer = str(matchObj.group(1)).strip() m_HDFSUser = str(matchObj.group(2)).strip() self.HDFS_Connect(m_HDFSServer, m_HDFSUser) return None, None, None, None, "Hdfs Server set successful." matchObj = re.match(r"hdfs\s+cd\s+(.*)$", m_szSQL, re.IGNORECASE | re.DOTALL) if matchObj: m_HDFSPath = str(matchObj.group(1)).strip() self.HDFS_CD(m_HDFSPath) return None, None, None, None, "Hdfs root dir change successful." matchObj = re.match(r"hdfs\s+status\s+(.*)$", m_szSQL, re.IGNORECASE | re.DOTALL) if matchObj: m_TargetFileList = str(matchObj.group(1)).strip() m_ReturnFileList = self.HDFS_status(m_TargetFileList) m_Result = [] for (m_FileName, m_FileProperties) in m_ReturnFileList: if m_FileProperties["type"] == "FILE": m_PermissionMask = "-" elif m_FileProperties["type"] == "DIRECTORY": m_PermissionMask = "d" else: m_PermissionMask = "?" if len(m_FileProperties["permission"]) == 3: for m_nPos in range(0, 3): if m_FileProperties["permission"][m_nPos] == "0": m_PermissionMask = m_PermissionMask + "---" elif m_FileProperties["permission"][m_nPos] == "1": m_PermissionMask = m_PermissionMask + "--x" elif m_FileProperties["permission"][m_nPos] == "2": m_PermissionMask = m_PermissionMask + "-w-" elif m_FileProperties["permission"][m_nPos] == "3": m_PermissionMask = m_PermissionMask + "-wx" elif m_FileProperties["permission"][m_nPos] == "4": m_PermissionMask = m_PermissionMask + "r--" elif m_FileProperties["permission"][m_nPos] == "5": m_PermissionMask = m_PermissionMask + "r-x" elif m_FileProperties["permission"][m_nPos] == "6": m_PermissionMask = m_PermissionMask + "rw-" elif m_FileProperties["permission"][m_nPos] == "7": m_PermissionMask = m_PermissionMask + "rwx" else: m_PermissionMask = m_PermissionMask + "???" else: m_PermissionMask = m_PermissionMask + "?????????" 
m_ModifiedTime = str( datetime.datetime.utcfromtimestamp( m_FileProperties["modificationTime"] / 1000).strftime("%Y-%m-%d %H:%M:%S")) m_Result.append([ m_TargetFileList, m_PermissionMask, m_FileProperties["owner"], m_FileProperties["group"], m_FileProperties["length"], m_ModifiedTime ]) return "HDFS file status:", m_Result, ["Path", "Permission", "owner", "group", "Size", "Modified"], \ None, "Total " + str(len(m_Result)) + " files listed." matchObj = re.match(r"hdfs\s+rm\s+(.*)$", m_szSQL, re.IGNORECASE | re.DOTALL) if matchObj: if matchObj: m_Bak_WebFSDir = self.__m_HDFS_WebFSDir__ m_FileDeleted = str(matchObj.group(1)).strip() m_FileDeletedPath = os.path.dirname(m_FileDeleted) m_FileDeletedName = os.path.basename(m_FileDeleted) self.HDFS_CD(m_FileDeletedPath) m_FileList = self.HDFS_list(self.__m_HDFS_WebFSDir__, recusive=False) for row in m_FileList: if fnmatch.fnmatch(os.path.basename(row[0]), m_FileDeletedName): self.__m_HDFS_Handler__.delete(row[0], recursive=True) # 重新返回原目录 self.HDFS_CD(m_Bak_WebFSDir) return None, None, None, None, "Hdfs file deleted successful." matchObj = re.match(r"hdfs\s+makedirs\s+(.*)$", m_szSQL, re.IGNORECASE | re.DOTALL) if matchObj: m_Dir = str(matchObj.group(1)).strip() self.HDFS_makedirs(m_Dir) return None, None, None, None, "Hdfs directory created successful." matchObj = re.match(r"hdfs\s+set_permission\s+(.*)\s+(.*)$", m_szSQL, re.IGNORECASE | re.DOTALL) if matchObj: m_File = str(matchObj.group(1)).strip() m_FilePermission = str(matchObj.group(2)).strip() self.HDFS_setPermission(m_File, m_FilePermission) return None, None, None, None, "Hdfs set permission successful." m_FileUpload = "" m_TargetDir = None matchObj = re.match(r"hdfs\s+upload\s+(.*)$", m_szSQL, re.IGNORECASE | re.DOTALL) if matchObj: m_FileUpload = str(matchObj.group(1)).strip() m_TargetDir = "" matchObj = re.match(r"hdfs\s+upload\s+(.*)\s+(.*)$", m_szSQL, re.IGNORECASE | re.DOTALL) if matchObj: m_FileUpload = str(matchObj.group(1)).strip() m_TargetDir = str(matchObj.group(2)).strip() if m_TargetDir is not None: self.HDFS_Upload(m_FileUpload, m_TargetDir) return None, None, None, None, "Hdfs file upload successful." m_FileDownload = "" m_TargetDir = None matchObj = re.match(r"hdfs\s+download\s+(.*)$", m_szSQL, re.IGNORECASE | re.DOTALL) if matchObj: m_FileDownload = str(matchObj.group(1)).strip() m_TargetDir = "" matchObj = re.match(r"hdfs\s+download\s+(.*)\s+(.*)$", m_szSQL, re.IGNORECASE | re.DOTALL) if matchObj: m_FileDownload = str(matchObj.group(1)).strip() m_TargetDir = str(matchObj.group(2)).strip() if m_TargetDir is not None: self.HDFS_Download(m_FileDownload, m_TargetDir) return None, None, None, None, "Hdfs file download successful." m_TargetFileList = None matchObj = re.match(r"hdfs\s+list(\s+)?$", m_szSQL, re.IGNORECASE | re.DOTALL) if matchObj: m_TargetFileList = "" matchObj = re.match(r"hdfs\s+list\s+(.*)?$", m_szSQL, re.IGNORECASE | re.DOTALL) if matchObj: m_TargetFileList = str(matchObj.group(1)).strip() if m_TargetFileList is not None: m_ReturnFileList = self.HDFS_list(m_TargetFileList, recusive=True) m_Result = [] for (m_FileName, m_FileProperties) in m_ReturnFileList: if m_FileProperties["type"] == "FILE": m_PermissionMask = "-" elif m_FileProperties["type"] == "DIRECTORY": m_PermissionMask = "d" else: m_PermissionMask = "?" 
if len(m_FileProperties["permission"]) == 3: for m_nPos in range(0, 3): if m_FileProperties["permission"][m_nPos] == "0": m_PermissionMask = m_PermissionMask + "---" elif m_FileProperties["permission"][m_nPos] == "1": m_PermissionMask = m_PermissionMask + "--x" elif m_FileProperties["permission"][m_nPos] == "2": m_PermissionMask = m_PermissionMask + "-w-" elif m_FileProperties["permission"][m_nPos] == "3": m_PermissionMask = m_PermissionMask + "-wx" elif m_FileProperties["permission"][m_nPos] == "4": m_PermissionMask = m_PermissionMask + "r--" elif m_FileProperties["permission"][m_nPos] == "5": m_PermissionMask = m_PermissionMask + "r-x" elif m_FileProperties["permission"][m_nPos] == "6": m_PermissionMask = m_PermissionMask + "rw-" elif m_FileProperties["permission"][m_nPos] == "7": m_PermissionMask = m_PermissionMask + "rwx" else: m_PermissionMask = m_PermissionMask + "???" else: m_PermissionMask = m_PermissionMask + "?????????" m_ModifiedTime = str( datetime.datetime.utcfromtimestamp( m_FileProperties["modificationTime"] / 1000).strftime("%Y-%m-%d %H:%M:%S")) m_Result.append([ m_FileProperties["pathSuffix"], m_PermissionMask, m_FileProperties["owner"], m_FileProperties["group"], m_FileProperties["length"], m_ModifiedTime ]) return "HDFS file List:", m_Result, ["Path", "Permission", "owner", "group", "Size", "Modified"], \ None, "Total " + str(len(m_Result)) + " files listed." return None, None, None, None, "Unknown HDFS Command." except (HDFSWrapperException, HdfsError) as he: if "SQLCLI_DEBUG" in os.environ: print('traceback.print_exc():\n%s' % traceback.print_exc()) print('traceback.format_exc():\n%s' % traceback.format_exc()) raise SQLCliException(he.message)
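A minimal sketch of driving the wrapper above through its SQL-like command interface; the WebHDFS URL, user and paths are placeholders, and the local file is assumed to exist:

# Hypothetical usage of HDFSWrapper's command interface; URL, user and paths are placeholders.
wrapper = HDFSWrapper()
print(wrapper.Process_SQLCommand("hdfs connect http://namenode:50070/user/demo with user demo")[-1])
print(wrapper.Process_SQLCommand("hdfs makedirs reports")[-1])
print(wrapper.Process_SQLCommand("hdfs upload ./summary.csv reports/summary.csv")[-1])
title, rows, headers, _, summary = wrapper.Process_SQLCommand("hdfs list reports")
print(title, summary)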