def __init__(self):
    self.client = Config().get_client('dev')
    try:
        self.client.list('datasets')
    except:
        self.client.makedirs('datasets')
def main():
    client = Config(path=hdfscliconf).get_client()
    with client.read('/user/orenault/passwd') as input:
        # print input.read()
        df = pd.read_csv(input, sep=':', header=None)

    cols = df.iloc[:, 0]
    client.write('/user/orenault/data.avro',
                 cols.to_csv(sep=":", header=True, index=False),
                 overwrite=True)
class HDFSLayout(Layout):
    def __init__(self, path, config=None, dynamic_getters=False,
                 absolute_paths=True, regex_search=False):
        """
        A container for all the files and metadata found at the specified path.

        Args:
            path (str): The root path of the layout.
            config (str): The path to the JSON config file that defines the
                entities and paths for the current layout.
            dynamic_getters (bool): If True, a get_{entity_name}() method will
                be dynamically added to the Layout every time a new Entity is
                created. This is implemented by creating a partial function of
                the get() function that sets the target argument to the
                entity name.
            absolute_paths (bool): If True, grabbit uses absolute file paths
                everywhere (including when returning query results). If False,
                the input path will determine the behavior (i.e., relative if
                a relative path was passed, absolute if an absolute path was
                passed).
            regex_search (bool): Whether to require exact matching (True) or
                regex search (False, default) when comparing the query string
                to each entity in .get() calls. This sets a default for the
                instance, but can be overridden in individual .get() requests.
        """
        self._hdfs_client = Config().get_client()

        path = abspath(path) if absolute_paths and self._hdfs_client is None \
            else path

        # Preprocess the config file
        if isinstance(config, six.string_types):
            config = '/'.join(config.strip('hdfs://').split('/')[1:])
            config = config.replace(self._hdfs_client.root[1:], '')
            with self._hdfs_client.read(config) as reader:
                config = json.load(reader)

        super(HDFSLayout, self).__init__(path, config, dynamic_getters,
                                         absolute_paths, regex_search)

    def _get_files(self):
        self.root = '/'.join(
            self.root.strip('hdfs://').split('/')[1:]).replace(
                self._hdfs_client.root[1:], '')
        return self._hdfs_client.walk(self.root)

    def _make_file_object(self, root, f):
        filepath = str(psp.join(root, f))
        with self._hdfs_client.read(filepath):
            return File(filepath)
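# Minimal usage sketch for the layout above (the HDFS paths are hypothetical,
# and a grabbit-style JSON config describing the layout's entities is assumed
# to live next to the data; grabbit's Layout API supplies .get()):
layout = HDFSLayout('hdfs://namenode/data/project',
                    config='hdfs://namenode/data/project/config.json')
print(layout.get())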
def read(cls, file_path):
    lines = []
    try:
        client = Config().get_client('dev')
        with client.read(file_path, encoding='utf-8', delimiter='\n') as reader:
            for line in reader:
                lines.append(line)  # possibly unnecessary; the reader itself could be returned instead
    except:
        print("ERROR: Could not read from HDFS.")
        raise
    return lines
def main():
    arg = parsing_options()
    client = Config().get_client()
    with client.read(arg.input) as inputFile:
        # Load file into a dataframe
        df = pd.read_csv(inputFile, sep=arg.delimiter, header=arg.header)

    # Open output file
    with client.write(arg.output, overwrite=arg.overwrite) as outputFile:
        # Flatten the list of columns
        column = list(itertools.chain.from_iterable(arg.column))
        # Open RSA key
        key = get_key(arg.RSAkey, arg.operation)
        # Extract columns which need to be hashed / encrypted
        cols = df.iloc[:, column]
        colName = cols.columns
        if arg.operation == 'decrypt':
            # Do not forget the comma after the key: the correct Python
            # syntax for a singleton tuple is (key,), not (key), which is
            # just an expression with the value key.
            df[colName] = df[colName].apply(decrypt, args=(key,), axis=1)
            df.to_csv(outputFile, sep=":", header=True, index=False)
        else:
            # Encrypt, then hash - otherwise we would encrypt the hash value.
            # Call encrypt with the RSA key.
            encrypted = df[colName].apply(encrypt, args=(key,))  # , axis=1)
            # Rename headers so they do not clash when merging df with the
            # encrypted dataframe
            new_column = []
            for i in colName:
                new_column.append(str(i) + '_ENC')
            encrypted.columns = new_column
            # Concatenate both dataframes
            df = pd.concat([df, encrypted], axis=1)
            # Generate a hash
            df[colName] = df[colName].apply(hash_value).values
            # Write to file
            df.to_csv(outputFile, sep=":", header=True, index=False)
def main():
    conf = SparkConf().setAppName("binarize nifti")
    sc = SparkContext(conf=conf)
    sc.setLogLevel('ERROR')

    parser = argparse.ArgumentParser(description='Binarize images')
    parser.add_argument('threshold', type=int, help="binarization threshold")
    parser.add_argument('folder_path', type=str,
                        help='folder path containing all of the splits')
    parser.add_argument('output_path', type=str, help='output folder path')
    parser.add_argument('num', type=int, choices=[2, 4, 6, 8],
                        help='number of binarization operations')
    parser.add_argument('-m', '--in_memory', type=bool, default=True,
                        help='in memory computation')

    args = parser.parse_args()

    nibRDD = sc.binaryFiles(args.folder_path)\
        .map(lambda x: get_data(x))

    client = Config().get_client('dev')

    if args.in_memory:
        print "Performing in-memory computations"
        for i in xrange(args.num - 1):
            nibRDD = nibRDD.map(lambda x: binarize(x, args.threshold))
        nibRDD = nibRDD.map(lambda x: binarize_and_save(
            x, args.threshold, args.output_path, client)).collect()
    else:
        print "Writing intermediary results to disk and loading from disk"
        binRDD = nibRDD.map(lambda x: binarize_and_save(
            x, args.threshold, args.output_path + "1", client)).collect()
        for i in xrange(args.num - 1):
            binRDD = sc.binaryFiles(args.output_path + "1")\
                .map(lambda x: get_data(x))\
                .map(lambda x: binarize_and_save(
                    x, args.threshold, args.output_path + "1", client)).collect()
def __init__(self, deviceInfoTableName, kind, dataBaseInfo, needFields="*", schema=None):
    self.dataBaseInfo = dataBaseInfo
    self.prefix = deviceInfoTableName
    self.kind = kind
    self.initDir = "/user/ct_fota/YangShuxuanNotDelete"
    self.iniFileName = self.kind + "/" + self.prefix + ".ini"
    self.needFields = needFields
    # self.initLog()
    self.connectDB()
    self.clientHDFS = Config().get_client()
    self.changtimes = 0
    self.schema = schema
def main():
    conf = SparkConf().setAppName("binarize nifti")
    sc = SparkContext(conf=conf)
    sc.setLogLevel('ERROR')

    parser = argparse.ArgumentParser(
        description='Binarize images using FSL installed in a Docker container')
    parser.add_argument('threshold', type=int, help="binarization threshold")
    parser.add_argument('folder_path', type=str,
                        help='folder path containing all of the splits')
    parser.add_argument('output_path', type=str, help='output folder path')

    args = parser.parse_args()
    print args.folder_path
    client = Config().get_client('dev')

    nibRDD = sc.binaryFiles(args.folder_path)\
        .map(lambda x: get_data(x))\
        .map(lambda x: binarize(x, args.threshold))\
        .map(lambda x: copy_to_hdfs(x, args.output_path, client)).collect()
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib, urllib.request
from pyquery import PyQuery as pq
from mongoconnect import *
import hashlib
from hdfs import Config

client = Config().get_client('dev')
KEY_WORD = 'news'
exec('database=db_' + KEY_WORD)


def fetchData(item):
    request = urllib.request.Request(item['href'])
    result = urllib.request.urlopen(request, timeout=25)
    if result.code in (200, 204):
        ts = str(result.read(), encoding='gbk')
        d = pq(ts)
        d = d('div#content')
        head = d('div.hd h1').text()
        clas = d('div.a_Info span.a_catlog').text()
        source = d('div.a_Info span.a_source').text()
        time = d('div.a_Info span.a_time').text()
        body = d('div#Cnt-Main-Article-QQ p').text()
        print(time, ' ', clas, ' ', source, ' ', head)
        newhashid = hashlib.md5((head + time).encode()).hexdigest()
        print(body)
        # mongo update class and source,
def __init__(self, profile):
    self.client = Config().get_client(profile)
# In[1]:

import os
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
import hyperloglog
from concurrent.futures import Future
from hdfs import Config
import subprocess

try:
    client = Config().get_client()
except:
    config_fname = "hdfscli.cfg"
    with open(config_fname, "wt") as f:
        f.write("""
[global]
default.alias = default

[default.alias]
url = http://mipt-master.atp-fivt.org:50070
user = {user}
""".format(user=os.environ["USER"]))
    client = Config(config_fname).get_client()

nn_address = subprocess.check_output(
    'hdfs getconf -confKey dfs.namenode.http-address',
    shell=True).strip().decode("utf-8")
schema_Freq_DF = typ.StructType([
    typ.StructField("CHROM", typ.IntegerType(), False),
    typ.StructField("POS", typ.IntegerType(), False),
    typ.StructField("N_ALLELES", typ.IntegerType(), False),
    typ.StructField("N_CHR", typ.IntegerType(), False),
    typ.StructField("ALLELE_FREQ_1", typ.StringType(), False),
    typ.StructField("ALLELE_FREQ_2", typ.StringType(), False),
    typ.StructField("ID", typ.StringType(), True),
])

###############
### Setting up File Paths and Lists
###############

client = Config().get_client('dev')

workingFolder_Indian = "SgIndian_vcf/dataFreeze_Feb2013/SNP/biAllele/"
workingFolder_Malay = "SgMalay_vcf/2012_05/snps/"
workingFolder_Chinese = "1000G_CDX/Phase3/integrated/"

# Filing number of unique samples found in the working folder...
freqFiles_Indian = [
    f for f in client.list(workingFolder_Indian)
    if re.match(r'chr\d+_analysis_exome\.frq', f)
]
rsIDFiles_Indian = [
    f for f in client.list(workingFolder_Indian)
# encoding: utf-8

"""Sample HdfsCLI script.

This example shows how to write files to HDFS, read them back, and perform a
few other simple filesystem operations.
"""

from hdfs import Config
from json import dump, load


# Get the default alias' client. (See the quickstart section in the
# documentation to learn more about this.)
client = Config().get_client()

# Some fake data that we are interested in uploading to HDFS.
model = {
    '(intercept)': 48.,
    'first_feature': 2.,
    'second_feature': 12.,
}

# First, we delete any existing `models/` folder on HDFS.
client.delete('models', recursive=True)

# We can now upload the data, first as CSV.
with client.write('models/1.csv', encoding='utf-8') as writer:
    for item in model.items():
        writer.write(u'%s,%s\n' % item)
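# The excerpt above stops after the CSV upload; as a sketch of where the
# imported `dump` and `load` helpers could come in, the same model can be
# round-tripped as JSON (the 'models/1.json' path is an assumption, not part
# of the original script):
with client.write('models/1.json', encoding='utf-8') as writer:
    dump(model, writer)

with client.read('models/1.json', encoding='utf-8') as reader:
    assert load(reader) == model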
class HomuraFS():
    def __init__(self):
        self.client = Config().get_client('dev')
        self.prompt = 'homura_fs $ '
        self.name = None
        self.local_xml = None
        self.hdfs_xml = '.last_sync.xml'
        self.hdfs_loc_xml = None
        self.mount_root = None  # os.getcwd() + '/test'
        self.hdfs_root = '/cs219'
        self.meta = HomuraMeta()
        self.monitor = None
        if sys.platform.startswith('darwin'):
            logging.basicConfig(filename='mylog.log', level=logging.INFO)
            self.monitor = Monitor_Start()

    def shell_loop(self):
        while True:
            cmd = raw_input(self.prompt)

            if cmd == 'sync':
                print "Current devices attached:"
                id_mapping = dict()
                count = 1
                if len(self.monitor.devs) == 0:
                    print "No device attached"
                    continue
                for dev in self.monitor.devs:
                    # print dev
                    devname = dev['Dname']
                    manufacture = dev['Man']
                    hname = dev['Hname']
                    id_mapping[count] = dev
                    print "{}) Dname: {}, Hname: {}, Manufacture: {}.\n".format(
                        count, devname, hname, manufacture)
                    count += 1
                dev_id = int(raw_input("Which device to sync:\n"))
                if dev_id == 0:
                    continue
                if dev_id in id_mapping:
                    # self.name = id_mapping[dev_id]['UID']
                    self.name = ''
                    self.mount_root = id_mapping[dev_id]['Path']
                    self.local_xml = self.mount_root + '/.last_sync.xml'
                    self.hdfs_loc_xml = self.mount_root + '/.cur_hdfs.xml'
                    self.meta.myRootpath = self.mount_root

                    log('Mount root is ' + self.mount_root)
                    log('Device xml file is ' + self.local_xml)
                    log('HDFS xml file is ' + self.hdfs_xml)
                    log('Copy of HDFS xml stored at ' + self.hdfs_loc_xml)
                    log('Syncing files for device ' + id_mapping[dev_id]['Dname'])
                    self.sync_files()
                else:
                    pass
            elif cmd == 'test':
                pass
                # log('Setting up test directory with default config')
                # self.__test()
            elif cmd == 'download':
                pass
            elif cmd == 'quit':
                if self.monitor:
                    Monitor_Stop(self.monitor)
                return

    def download_all(self):
        log('Downloading all files from HDFS to local device')
        try:
            self.create_file(self.mount_root, self.hdfs_root, 1)
            for dir_or_file in os.listdir(self.mount_root + self.hdfs_root):
                if not dir_or_file.startswith('.'):
                    shutil.move(
                        self.mount_root + self.hdfs_root + '/' + dir_or_file,
                        self.mount_root)
            shutil.rmtree(self.mount_root + self.hdfs_root)
        except:
            log('Something went wrong while downloading files')
            try:
                shutil.rmtree(self.mount_root + self.hdfs_root)
            except:
                pass
        self.meta.path2Xml(self.mount_root)
        self.meta.saveXml(self.local_xml, Xml='temp')

    def upload_all(self):
        log('Uploading all files from local device to HDFS')
        for dir_or_file in os.listdir(self.mount_root):
            if not dir_or_file.startswith('.'):
                try:
                    log('Uploading to ' + self.hdfs_root + '/' + dir_or_file)
                    self.client.upload(self.hdfs_root + '/' + dir_or_file,
                                       self.mount_root + '/' + dir_or_file,
                                       n_threads=0)
                except:
                    log('Warning: could not upload')

    def load_HDFS_XML(self):
        log("Attempting to fetch HDFS xml")
        self.update_file(self.hdfs_loc_xml, self.hdfs_xml, 1)
        log("Loading HDFS xml")
        self.meta.loadHDFSXml(self.hdfs_loc_xml)
        os.remove(self.hdfs_loc_xml)

    def sync_files(self):
        # check if we have an old snapshot xml
        if not os.path.isfile(self.local_xml):
            # snapshot doesn't exist, so download everything
            log("No local snapshot file was found at " + self.local_xml)
            self.meta.Snapshotdoc = self.meta.emptyXml()  # use empty
            try:  # fetch HDFS xml and store locally
                self.load_HDFS_XML()
            except:
                self.meta.HDFSdoc = self.meta.emptyXml()
        else:
            log("Fetching local snapshot xml from " + self.local_xml)
            self.meta.loadSnapshotXml(self.local_xml)
            try:  # fetch HDFS xml and store locally
                self.load_HDFS_XML()
            except:
                self.meta.HDFSdoc = self.meta.emptyXml()

        self.meta.path2Xml(self.mount_root)
        self.meta.mydoc = self.meta.tempdoc

        # print 'HDFS XML:'
        # self.meta.showHDFSXml()
        # print '---\nSnapshot Xml'
        # self.meta.showSnapshotXml()
        # print '---\nLocal Xml'
        # self.meta.showMyXml()

        # find operations since last sync
        (my_creates, my_deletes, my_modifies, hdfs_creates, hdfs_deletes,
         hdfs_modifies) = self.meta.getOperations()

        root = self.mount_root
        name = self.hdfs_root

        # apply operations on current device
        for path in my_creates:
            if path.endswith('/'):  # path is a folder we want to create
                os.makedirs(root + path)
            else:
                self.create_file(root + path, name + path, 1)
        for path in my_modifies:
            self.update_file(root + path, name + path, 1)
        for path in my_deletes:
            self.delete_file(root + path, 1)

        # apply operations on HDFS
        for path in hdfs_creates:
            if path.endswith('/'):  # path is a folder we want to create
                self.client.makedirs(name + path)
            else:
                self.create_file(root + path, name + path, 0)
        for path in hdfs_modifies:
            self.update_file(root + path, name + path, 0)
        for path in hdfs_deletes:
            self.delete_file(name + path, 0)

        # update last sync for both HDFS and current device
        self.meta.path2Xml(self.mount_root)
        self.meta.saveXml(self.local_xml, Xml='temp')
        self.update_file(self.local_xml, self.hdfs_xml, 0)
        return

    # in this set of functions, when kyuubey = 0, the operation goes
    # from loc to hdfs (i.e. local becomes the "master")
    # when kyuubey = 1, the operation goes from hdfs to loc
    # (i.e. hdfs becomes the "master")
    def create_file(self, loc_path, hdfs_path, kyuubey):
        if kyuubey == 0:
            log('Creating ' + hdfs_path + ' on HDFS')
            self.client.upload(hdfs_path, loc_path, n_threads=0)
        elif kyuubey == 1:
            log('Creating ' + loc_path + ' locally')
            self.client.download(hdfs_path, loc_path, n_threads=0)

    def update_file(self, loc_path, hdfs_path, kyuubey):
        if kyuubey == 0:  # updating file on HDFS
            log('Updating file ' + hdfs_path + ' on HDFS')
            with open(loc_path) as reader:
                with self.client.write(hdfs_path, overwrite=True) as writer:
                    for line in reader:
                        writer.write(line)
        elif kyuubey == 1:
            log('Updating file ' + loc_path + ' locally')
            with open(loc_path, 'w') as writer:
                with self.client.read(hdfs_path) as reader:
                    data = reader.read()
                    writer.write(data)

    def delete_file(self, path, kyuubey):
        if kyuubey == 0:  # delete file on HDFS
            log('Deleting file ' + path + ' from HDFS')
            self.client.delete(path, recursive=True)
        elif kyuubey == 1:  # delete file locally
            log('Deleting file ' + path + ' locally')
            os.remove(path)

    def move_file(self, src_path, dst_path, kyuubey):
        if kyuubey == 0:  # move file on HDFS
            log('Moving file from ' + src_path + ' to ' + dst_path + ' on HDFS')
            self.client.rename(src_path, dst_path)
        elif kyuubey == 1:  # move file locally
            os.rename(src_path, dst_path)
            log('Moving file from ' + src_path + ' to ' + dst_path + ' locally')

    def __test(self, test_no=1):
        self.__reset_test()
        if test_no == 1:
            self.__config_basic()
        elif test_no == 2:
            self.__config_outer_empty()

    def __reset_test(self):
        root = self.mount_root
        log('Resetting mount directory')
        if os.path.exists(root):
            shutil.rmtree(root)
        os.makedirs(root)

    def __config_basic(self):
        root = self.mount_root
        log('Config 1: default')
        with open(root + '/test1.txt', 'w') as writer:
            writer.write('hi\nthere\n!\n')
        with open(root + '/test2.txt', 'w') as writer:
            writer.write('one-liner')
        with open(root + '/test3.txt', 'w') as writer:
            writer.write('')
        os.makedirs(root + '/subdir')
        with open(root + '/subdir/test1.txt', 'w') as writer:
            writer.write('a different\ntest1.txt\nfile!\n')
        os.makedirs(root + '/subdir/subsubdir')
        with open(root + '/subdir/subsubdir/test1.txt', 'w') as writer:
            writer.write('yet another different\ntest1.txt\nfile!\n')

    def __config_outer_empty(self):
        root = self.mount_root
        log('Config 2: outer directory empty')
        os.makedirs(root + '/subdir')
        with open(root + '/subdir/test1.txt', 'w') as writer:
            writer.write('a different\ntest1.txt\nfile!\n')
        os.makedirs(root + '/subdir/subsubdir')
        with open(root + '/subdir/subsubdir/test1.txt', 'w') as writer:
            writer.write('yet another different\ntest1.txt\nfile!\n')
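# Minimal usage sketch: the class above is an interactive shell over the 'dev'
# HdfsCLI alias, so the usual entry point is simply to construct it and hand
# control to its loop (this guard is an assumption, not part of the original).
if __name__ == '__main__':
    fs = HomuraFS()
    fs.shell_loop()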
class Pickler:
    def __init__(self, sc, spark_session, uri, port):
        self.sc = sc
        self.spark_session = spark_session
        self.df = []
        self.models = []
        self.graphs = []
        self.base_path = uri + ":" + port
        self.local_pickle_path = os.path.dirname(
            os.path.realpath(__file__)) + '/../pickles/'
        self.pickle_path = '/user/hadoop/pickles/'
        self.model_path = '/user/hadoop/pickles/models/'
        self.dataset_path = self.pickle_path + "dataset/"
        self.private_release_path = self.dataset_path + "private/"
        self.anon_release_path = self.dataset_path + "github/"
        self.prod_release_path = self.dataset_path + "prod/"
        self.df_path = self.pickle_path + 'df/'
        self.graph_path = self.local_pickle_path + 'graphs/'
        self.labelled_df_path = self.df_path + 'labelled/'
        self.hdfs_client = Config().get_client('dev')
        self.load_df()
        self.load_models()
        self.load_graphs()

    # TODO: Implement generic methods for read dataset / model ONLY
    def read(self):
        pass

    def save(self):
        pass

    def getLabelledFiles(self):
        return self.hdfs_client.list(self.prod_release_path)

    def readCSVToDF(self, date, folder):
        return self.spark_session.read.option("header", True).csv(
            self.base_path + self.dataset_path + folder + "/" + date)

    def getLabelledTelemetry(self):
        return self.hdfs_client.list(self.private_release_path)

    def existsModel(self, name):
        res = self.hdfs_client.list(self.model_path)
        file_extension = '.model'
        if name + file_extension in res:
            return True
        return False

    def getModel(self, name):
        return PipelineModel.load(self.base_path + self.model_path + name + ".model")

    def isDateLabelled(self, date):
        res = self.hdfs_client.list(self.prod_release_path)
        file_extension = ".csv"
        if date + file_extension in res:
            return True
        return False

    def load_graphs(self):
        for file in os.listdir(self.graph_path):
            if file.endswith(".pickle"):
                self.graphs.append(file[:-7])

    def existsGraph(self, date):
        if date in self.graphs:
            return True
        return False

    def getGraph(self, date):
        if date in self.graphs:
            with open(self.graph_path + date + ".pickle", 'rb') as pickle_file:
                content = pickle.load(pickle_file)
            return content

    def saveGraph(self, G, date):
        if date in self.graphs:
            return False
        nx.write_gpickle(G, self.graph_path + date + ".pickle")
        self.graphs.append(date)

    def existsDF(self, date, source):  # e.g. 2020.03.01_joy
        hash = self.getHash(date, source)
        if hash in self.df:
            return True
        return False

    def load_df(self):
        # Load Joy data
        res = self.hdfs_client.list(self.df_path + 'joy')
        # print(f"Joy Items in directory: {res}")
        for file in res:
            if file.endswith(".parquet"):
                self.df.append(sha256(file[:-8].encode('utf-8')).hexdigest())

        # Load graph features DF
        res = self.hdfs_client.list(self.df_path + 'graph')
        # print(f"Graph DF Items in directory: {res}")
        for file in res:
            if file.endswith(".parquet"):
                self.df.append(sha256(file[:-8].encode('utf-8')).hexdigest())

        # Load labelled DF
        res = self.hdfs_client.list(self.df_path + 'labelled')
        # print(f"Labelled Items in directory: {res}")
        for file in res:
            if file.endswith(".parquet"):
                self.df.append(sha256(file[:-8].encode('utf-8')).hexdigest())

        # TODO: Load others?

    def saveModel(self, model, name):
        model.save(self.base_path + self.model_path + name + ".model")

    def load_models(self):
        res = self.hdfs_client.list(self.model_path)
        for file in res:
            if file.endswith(".model"):
                self.models.append(file.split('.')[0])

    def saveDFToCSV(self, df, date, folder, coalesced=False):
        if coalesced:
            df.coalesce(1).write.csv(self.base_path + self.pickle_path +
                                     "dataset/" + folder + '/' + date + '.csv',
                                     header=True)
        else:
            df.write.csv(self.base_path + self.pickle_path + "dataset/" +
                         folder + '/' + date + '.csv',
                         header=True)
        df.write.parquet(self.base_path + self.pickle_path + "dataset/" +
                         folder + '/' + date + '.parquet')

    def saveDF(self, df, date, source):
        hash = self.getHash(date, source)
        if hash in self.df:
            return False
        else:
            df.write.parquet(self.base_path + self.df_path + source + '/' +
                             date + "_" + source + '.parquet')
            self.df.append(hash)

    def getDF(self, date, source):
        hash = self.getHash(date, source)
        if hash in self.df:
            df = self.spark_session.read.parquet(self.base_path + self.df_path +
                                                 source + '/' + date + "_" +
                                                 source + '.parquet')
            return df
        return False

    def getHash(self, date, source):
        id = date + "_" + source
        hash = sha256(id.encode('utf-8')).hexdigest()
        return hash
HDFS_OUTPUT_DIR = "/OUTPUT/"
HDFS_BASE_URL = "hdfs://bdrenfdludcf01:9000"

if __name__ == "__main__":
    # Folder creation for placing all the spark data
    cmd_a = "mkdir -p " + "/tmp/SPARK_PROCESS/"
    os.system(cmd_a)

    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME).set("spark.local.dir",
                                                "/tmp/SPARK_PROCESS/")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    client = Config().get_client('bdrenhdfs')
    files = client.list(HDFS_RAWFILE_DIR)
    totalfilecount = len(files)

    if totalfilecount == 0:
        print("There are no files to be processed, application exiting...")
        sys.exit(0)

    filecount = 0
    for filename in files:
        print(filename)
        if filename.find("Covid_Analysis_DataSet.csv") >= 0:
            filecount = filecount + 1
            df_covid = sqlContext.read.format("csv").option(
                "delimiter",
def get_hdfs(alias='lake'):
    # https://hdfscli.readthedocs.io/en/latest/api.html
    from hdfs import Config
    client = Config().get_client(alias)
    return client
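# Hypothetical usage (assumes a `lake` alias is defined in ~/.hdfscli.cfg):
# list the contents of the HDFS root with the returned client.
hdfs = get_hdfs()
print(hdfs.list('/'))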
from hdfs import Config
from sys import argv
from math import ceil

script, filename = argv

client = Config().get_client()
status = client.status(filename)
print(ceil(status['length'] / status['blockSize']))
class HadoopWebExplorer:
    def __init__(self, debug=False):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            '.hdfscli.cfg')
        self.client = Config(path).get_client()
        self.debug = debug

    def print(self, *args):
        if self.debug:
            print(*args)

    def path_exists(self, path):
        """
        Checks whether such path already exists

        :param path: path to check
        :type path: unicode
        :return: boolean flag indicating whether path already exists or not
        :rtype: bool
        """
        return self.client.status(path, strict=False) is not None

    @catch_hdfs_error
    def create_folder(self, folder_name):
        """
        Creates folder with the given name if it does not exist

        :param folder_name: the name of the folder we want to add
        :type folder_name: unicode
        :return: returns true if it created the folder or it already exists,
            otherwise false
        :rtype: bool
        """
        if self.path_exists(folder_name):
            print(f'Folder already exists: {folder_name}')
            return True
        self.print(f'Folder does not exist: {folder_name}')
        self.client.makedirs(folder_name)
        self.print(f'Folder created: {folder_name}')

    @catch_hdfs_error
    def write_to_file(self, folder_name, file_name, data, overwrite=False, append=False):
        """
        Writes provided data into file in the specified folder

        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file where data should be written to
        :type file_name: unicode
        :param data: data to be written
        :type data: unicode
        :param overwrite: overwrite any existing file or directory
        :type overwrite: bool
        :param append: append to a file rather than create a new one
        :type append: bool
        :return: returns true if it successfully wrote the data, otherwise false
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        if append and not self.path_exists(path):
            self.client.write(path, data, encoding='utf-8', overwrite=overwrite)
        else:
            self.client.write(path, data, encoding='utf-8',
                              overwrite=overwrite, append=append)
        self.print("Written data to HDFS file")

    @catch_hdfs_error
    def read_from_file(self, folder_name, file_name):
        """
        Reads from file in the specified folder

        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file where data should be read from
        :type file_name: unicode
        """
        path = os.path.join(folder_name, file_name)
        if not self.path_exists(path):
            self.print(f'File does not exist: {path}')
            return None
        return self.client.read(path)

    @catch_hdfs_error
    def delete_file(self, folder_name, file_name):
        """
        Deletes file in the specified folder

        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file to be deleted
        :type file_name: unicode
        :return: returns true if it successfully deleted the file, otherwise false
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        return self.client.delete(path)

    @catch_hdfs_error
    def delete_folder(self, folder_name):
        """
        Deletes the specified folder

        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :return: returns true if it successfully deleted the folder, otherwise false
        :rtype: bool
        """
        return self.client.delete(folder_name, recursive=True)

    @catch_hdfs_error
    def explore_folder(self, folder_name):
        """
        Explores the specified folder

        :param folder_name: name of the folder to be observed
        :type folder_name: unicode
        """
        if not self.path_exists(folder_name):
            self.print(f'Folder does not exist: {folder_name}')
        self.print(f'Exploring folder: {folder_name}')
        for path, dirs, files in self.client.walk(folder_name, status=True):
            for file in files:
                block_size = file[1]['blockSize']
                size = file[1]['length']
                owner = file[1]['owner']
                self.print(
                    f'\tFile: {file[0]}, blockSize: {block_size}, size: {size}, owner: {owner}'
                )
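# Minimal usage sketch (folder and file names are made up for illustration;
# assumes the .hdfscli.cfg loaded in __init__ points at a reachable cluster):
explorer = HadoopWebExplorer(debug=True)
explorer.create_folder('demo')
explorer.write_to_file('demo', 'hello.txt', 'hello from HdfsCLI\n', overwrite=True)
explorer.explore_folder('demo')
explorer.delete_folder('demo')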
from hdfs import Config, InsecureClient
import cPickle as pickle

from tuple import Tuple

client = Config().get_client('dev')
client.write('a/p', 'aaa', overwrite=True)
print client.status('a')
def main():
    arg = parsing_options()
    krb_client = Config(path=arg.hdfsConf).get_client()
    az_conf = read_conf(arg.azureConf)
    az_client = az_key_vault_connection(az_conf['azure_client_id'],
                                        az_conf['azure_client_secret'],
                                        az_conf['azure_tenant_id'])
    az_rsa_key = az_get_rsa_key_info(az_client, az_conf['key_vault'],
                                     az_conf['key_name'])
    column = list(itertools.chain.from_iterable(arg.column))

    with krb_client.read(arg.input) as inputFile:
        with krb_client.write(arg.output, overwrite=arg.overwrite) as outputFile:
            if arg.operation == 'encrypt':
                aes_key = generate_aes_key()
                az_conf['uuid'] = str(uuid.uuid4())
                encrypt_and_store_aes_key(az_client, az_conf,
                                          az_rsa_key['version'],
                                          base64.b64encode(aes_key))
                df = pd.read_csv(inputFile, sep=arg.delimiter,
                                 header=arg.header, dtype=str, chunksize=10000)
                num_chunk = 0
                for chunk in df:
                    # Generate new column names and hash in place
                    new_column = []
                    for i in column:
                        new_column.append(str(i) + '_HASH')
                    chunk[new_column] = chunk[column].apply(hash_value)
                    # Encrypt in place
                    chunk[column] = chunk[column].apply(
                        encrypt, args=(aes_key, az_conf['uuid']))
                    if num_chunk == 0:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=True, index=False)
                        num_chunk += 1
                    else:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=False, index=False)
            else:
                df = pd.read_csv(inputFile, sep=arg.delimiter,
                                 header=arg.header, dtype=str, chunksize=1000)
                num_chunk = 0
                for chunk in df:
                    if num_chunk == 0:
                        # Split only the first column - grab the 3rd field
                        # (the key) and take its value at row [0]
                        key = base64.b64decode(chunk[column[0]].str.split(
                            pat='-', n=3, expand=True)[3][0])
                        aes_key = retrieve_and_decrypt_aes_key(
                            az_client, az_conf, az_rsa_key['version'], key)
                    chunk[column] = chunk[column].apply(decrypt, args=(aes_key,))
                    if num_chunk == 0:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=True, index=False)
                        num_chunk += 1
                    else:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=False, index=False)
#!/usr/bin/env python3
from hdfs import Config
import sys

client = Config().get_client()
filename = sys.argv[1]

with client.read(filename) as reader:
    ans = reader.read(10)
print(ans.decode())
def main(argv):
    # Validate input
    try:
        opts, args = getopt.getopt(argv, "hd:n:l:m:")
    except getopt.GetoptError:
        print('usage: spark-submit \\ \n --master <master> \\ \n <path>/ClasificacionImagenes.py \\')
        print('       [-d <directorio_salida>] [-n <numero_imagenes>] \\ \n ')
        print('       [-l <tamaño_lote>] [-m <max_etiquetas>] [-s <url_images>]')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('usage: spark-submit \\ \n --master <master> \\ \n <path>/ClasificacionImagenes.py \\')
            print('       [-d <directorio_salida>] [-n <numero_imagenes>] \\ \n ')
            print('       [-l <tamaño_lote>] [-m <max_etiquetas>] [-s <url_images>]')
            sys.exit()
        elif opt == "-s":
            C.images_index_url = arg
        elif opt == "-d":
            C.dir_classification = arg
        elif opt == "-n":
            C.numero_imagenes_proceso = int(arg)
        elif opt == "-l":
            C.lote_size = int(arg)
        elif opt == "-m":
            C.max_etiquetas = int(arg)

    print("Directory: ", C.dir_classification)
    print("Number of images to process: ", C.numero_imagenes_proceso)
    print("Images per batch: ", C.lote_size)
    print("Max labels to save: ", C.max_etiquetas)

    # ***************************************************************************
    # Start of the process
    # ***************************************************************************
    global node_lookup_bc
    global model_data_bc

    # Start the SparkContext
    print("Start: ",
          datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))
    sc = SparkContext(
        appName='Clasificacion MirFlickr con TensorFlow',
        pyFiles=[
            '/home/utad/TFM/Fuentes/TensorFlowMirFlickr/Constantes.py',
            '/home/utad/TFM/Fuentes/TensorFlowMirFlickr/NodeLookup.py'
        ])

    get_tensorflow_model()

    # Load the model and broadcast it to the workers
    model_path = os.path.join(C.model_dir, 'classify_image_graph_def.pb')
    with tf.gfile.FastGFile(model_path, 'rb') as f:
        model_data = f.read()
    model_data_bc = sc.broadcast(model_data)

    # Broadcast the node lookup so it can be used in the workers
    node_lookup = NodeLookup().node_lookup
    node_lookup_bc = sc.broadcast(node_lookup)

    # Get the list of images to process and group them into batches
    servicio_imagenes = None
    try:
        servicio_imagenes = urllib.urlopen(C.images_index_url)
    except Exception as e:
        print(e)
        print("Image server not available")
        exit(404)

    imagenes = servicio_imagenes.read().split('<li>')[2:C.numero_imagenes_proceso + 2]
    lote_imagenes = [
        imagenes[i:i + C.lote_size]
        for i in range(0, len(imagenes), C.lote_size)
    ]

    # Parallelize the image batches and process them
    rdd_imagenes = sc.parallelize(lote_imagenes).map(
        lambda x: map(obtener_nombre_imagen, x))
    inception_rdd = rdd_imagenes.flatMap(procesar_lote_imagenes)

    # Delete the categories directory from HDFS in case it already exists
    client = Config().get_client()
    client.delete('inception', recursive=True)

    # Save the resulting files in JSON format, using a dataframe
    print("Processing:",
          datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))
    spark = SparkSession(sc)
    inception_df = inception_rdd.toDF()
    inception_df.write.json(C.dir_classification)
    print("End:",
          datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))
#!/usr/bin/env python
# encoding: utf-8

"""Avro extension example."""

from hdfs import Config
from hdfs.ext.avro import AvroReader, AvroWriter


# Get the default alias' client.
client = Config().get_client()

# Some sample data.
records = [
    {'name': 'Ann', 'age': 23},
    {'name': 'Bob', 'age': 22},
]

# Write an Avro file to HDFS (since our records' schema is very simple, we let
# the writer infer it automatically, otherwise we would pass it as argument).
with AvroWriter(client, 'names.avro', overwrite=True) as writer:
    for record in records:
        writer.write(record)

# Read it back.
with AvroReader(client, 'names.avro') as reader:
    schema = reader.schema  # The inferred schema.
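    # (continuation sketch, not part of the original excerpt) the reader also
    # iterates over the deserialized records:
    content = list(reader)

print(schema)
print(content)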
class SparkHDFSClient(object):
    def __init__(self, datasource):
        self.datasource = datasource
        self.client = Config().get_client("dev")

    def get_file_list(self, folder):
        files = self.client.list(folder.strip())
        files = [folder + '/' + file for file in files]
        return files

    def list_collections(self):
        results = []
        status = self.client.status(self.datasource.url, strict=False)
        print(status, self.datasource.url)
        if status is not None:
            if status['type'] == "DIRECTORY":
                files = self.get_file_list(self.datasource.url)
                while len(files) > 0:
                    file = files.pop()
                    status = self.client.status(
                        os.path.join(self.datasource.url, file), strict=False)
                    if status is None:
                        continue
                    if status['type'] == "DIRECTORY":
                        subfiles = self.get_file_list(
                            os.path.join(self.datasource.url, file))
                        files.extend(subfiles)
                        continue
                    else:
                        if self.datasource.dstype == DataSourceType.SPARK_CSV and file[-2:] != 'sv' \
                                or self.datasource.dstype == DataSourceType.SPARK_TSV and file[-2:] != 'sv' \
                                or self.datasource.dstype == DataSourceType.SPARK_XML and file[-3:] != 'xml' \
                                or self.datasource.dstype == DataSourceType.SPARK_JSON and file[-4:] != 'json':
                            continue
                        row = {
                            "db": file[:file.rfind('/')] if '/' in file else self.datasource.url,
                            "document": file[file.rfind('/') + 1:] if '/' in file else file,
                            "count": -1
                        }
                        results.append(row)
                return results
            else:
                return [{
                    "db": self.datasource.url,
                    "document": self.datasource.url,
                    "count": -1
                }]
        else:
            return results

    def get_documents(self, filename, limit=10):
        results = []
        delimiter = "\n"
        header = None
        rows = 0
        if self.datasource.dstype == DataSourceType.SPARK_CSV or \
                self.datasource.dstype == DataSourceType.SPARK_TSV:
            delimiter = "\n"
            with self.client.read(filename, encoding='utf-8',
                                  delimiter=delimiter) as reader:
                for line in reader:
                    if len(line.strip()) == 0 or line[0] == '#':
                        continue
                    if filename[-3:] == "csv":
                        line = line.split(',')
                    else:
                        line = line.split('\t')
                    if header is None:
                        header = line
                        continue
                    res = {
                        header[i]: line[i]
                        for i in range(len(line)) if i < len(header)
                    }
                    results.append(res)
                    rows += 1
                    if rows > limit + 1:
                        break
        elif self.datasource.dstype == DataSourceType.SPARK_XML:
            with self.client.read(filename, encoding='utf-8',
                                  chunk_size=2048) as reader:
                header = ['content']
                for chunk in reader:
                    res = {'content': str(chunk)}
                    results.append(res)
                    print(results)
                    break
        elif self.datasource.dstype == DataSourceType.SPARK_JSON:
            with self.client.read(filename, encoding='utf-8') as reader:
                model = load(reader)
                if isinstance(model, list):
                    model = [{
                        p: str(list(md[p][0].keys()))
                        if isinstance(md[p], list) and isinstance(md[p][0], dict)
                        else str(md[p]) if isinstance(md[p], list)
                        else str(list(md[p].keys())) if isinstance(md[p], dict)
                        else md[p]
                        for p in md
                    } for md in model]
                    results.extend(model)
                else:
                    model = {
                        p: str(list(model[p][0].keys()))
                        if isinstance(model[p], list) and isinstance(model[p][0], dict)
                        else model[p] if isinstance(model[p], list)
                        else str(list(model[p].keys())) if isinstance(model[p], dict)
                        else model[p]
                        for p in model
                    }
                    results.append(model)
        return results[:limit], limit
class PendingWindow(object):
    """docstring for PendingWindow"""

    def __init__(self, backup_dir, node):
        # TODO: not cut
        # each pending window (or node) only has a single downstream cut,
        # otherwise inconsistency occurs during truncating
        self.backup_dir = backup_dir
        self.node = node

        self.hdfs_client = Config().get_client('dev')
        self.hdfs_client.makedirs(self.backup_dir)

        # each backup file is named by the ending version, so the current
        # writing one is named temporarily
        self.current_backup_path = os.path.join(self.backup_dir, 'current')
        # touch the file for later appending
        self.hdfs_client.write(self.current_backup_path, data='')

        # the version that the last truncation was conducted against
        self.safe_version_path = os.path.join(self.backup_dir, 'safe_version')
        # special case for initial version
        self.hdfs_client.write(self.safe_version_path, data=str(0))

        # the latest integral version
        self.latest_version_path = os.path.join(self.backup_dir, 'latest_version')
        # special case for initial version
        self.hdfs_client.write(self.latest_version_path, data=str(0))

        if self.node.type != 'sink':
            self.version_acks = dict()
            for n in self.node.downstream_connectors:
                self.version_acks[n] = 0

    def append(self, tuple_):
        """Make an output tuple persistent, and complete a version if necessary
        """
        self.hdfs_client.write(self.current_backup_path,
                               data=pickle.dumps(tuple_),
                               append=True)

        if isinstance(tuple_, BarrierTuple):
            self.hdfs_client.rename(
                self.current_backup_path,
                os.path.join(self.backup_dir, str(tuple_.version)))
            self.hdfs_client.write(self.latest_version_path,
                                   data=str(tuple_.version),
                                   overwrite=True)
            self.hdfs_client.write(self.current_backup_path, data='')

    def extend(self, tuples):
        # TODO: can be improved
        with self.hdfs_client.write(self.current_backup_path, append=True) as f:
            for t in tuples:
                pickle.dump(t, f)

        if isinstance(tuples[-1], BarrierTuple):
            self.hdfs_client.rename(
                self.current_backup_path,
                os.path.join(self.backup_dir, str(tuples[-1].version)))
            self.hdfs_client.write(self.latest_version_path,
                                   data=str(tuples[-1].version),
                                   overwrite=True)
            self.hdfs_client.write(self.current_backup_path, data='')

    def truncate(self, version):
        """Delete files with filename <= version
        """
        # with self.hdfs_client.read(self.safe_version_path) as f:
        #     safe_version = int(f.read())
        #
        # # only the = condition can occur
        # if version <= safe_version:
        #     return

        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) <= version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        # self.node.LOGGER.info('truncated version %d' % version)

    def handle_version_ack(self, version_ack):
        old_safe_version = min(self.version_acks.values())
        self.version_acks[version_ack.sent_from] = version_ack.version
        new_safe_version = min(self.version_acks.values())

        if new_safe_version > old_safe_version:
            self.hdfs_client.write(self.safe_version_path,
                                   data=str(new_safe_version),
                                   overwrite=True)
            self.truncate(new_safe_version)

    def get_latest_version(self):
        with self.hdfs_client.read(self.latest_version_path) as f:
            latest_version = int(f.read())
        return latest_version

    def rewind(self, version=None):
        """Delete files with filename > version (including the current file)
        """
        if version is None:
            self.hdfs_client.write(self.current_backup_path, data='',
                                   overwrite=True)
            return

        # TODO: underflow
        # assert version == 0 or

        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) > version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        self.hdfs_client.write(self.current_backup_path, data='',
                               overwrite=True)
        self.hdfs_client.write(self.latest_version_path, data=str(version),
                               overwrite=True)

    def replay(self):
        """When both the node and pending window state are ready, replay the
        pending window before resuming
        """
        for v in sorted(
                map(int,
                    filter(unicode.isdigit,
                           self.hdfs_client.list(self.backup_dir)))):
            # filter out the faster nodes
            tuples = []
            with self.hdfs_client.read(os.path.join(self.backup_dir, str(v))) as f:
                while True:
                    try:
                        t = pickle.load(f)
                        tuples.append(t)
                    except EOFError:
                        self.node.LOGGER.debug('reached EOF, send this version')
                        break
                    # Spout needs version too, so that data source can resend
                    # from a version
                    # except pickle.UnpickleableError:
                    #     self.node.LOGGER.debug('spout reached partial dump location, send this incomplete version')
                    #     break
            self.node.multicast(self.node.downstream_nodes, tuples)
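# Minimal usage sketch (kept as comments because the `node` object comes from
# the surrounding streaming framework and must expose `type`,
# `downstream_connectors`, `downstream_nodes`, `multicast` and `LOGGER`; the
# backup directory is an arbitrary example path):
#
#     pw = PendingWindow('/backup/operator-1', node)
#     pw.extend(output_tuples)      # a trailing BarrierTuple closes a version
#     pw.handle_version_ack(ack)    # truncates once every connector has acked
#     print(pw.get_latest_version())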