def __init__(self, sc, spark_session, uri, port):
    self.sc = sc
    self.spark_session = spark_session
    self.df = []
    self.models = []
    self.graphs = []
    self.base_path = uri + ":" + port
    self.local_pickle_path = os.path.dirname(
        os.path.realpath(__file__)) + '/../pickles/'
    self.pickle_path = '/user/hadoop/pickles/'
    self.model_path = '/user/hadoop/pickles/models/'
    self.dataset_path = self.pickle_path + "dataset/"
    self.private_release_path = self.dataset_path + "private/"
    self.anon_release_path = self.dataset_path + "github/"
    self.prod_release_path = self.dataset_path + "prod/"
    self.df_path = self.pickle_path + 'df/'
    self.graph_path = self.local_pickle_path + 'graphs/'
    self.labelled_df_path = self.df_path + 'labelled/'
    self.hdfs_client = Config().get_client('dev')
    self.load_df()
    self.load_models()
    self.load_graphs()
def __init__(self, backup_dir, node):
    # TODO: not cut
    # Each pending window (or node) only has a single downstream cut;
    # otherwise inconsistency occurs during truncating.
    self.backup_dir = backup_dir
    self.node = node
    self.hdfs_client = Config().get_client('dev')
    self.hdfs_client.makedirs(self.backup_dir)
    # Each backup file is named by its ending version, so the file
    # currently being written carries a temporary name.
    self.current_backup_path = os.path.join(self.backup_dir, 'current')
    # Touch the file for later appending.
    self.hdfs_client.write(self.current_backup_path, data='')
    # The version that the last truncation was conducted against.
    self.safe_version_path = os.path.join(self.backup_dir, 'safe_version')
    # Special case for the initial version.
    self.hdfs_client.write(self.safe_version_path, data=str(0))
    # The latest integral version.
    self.latest_version_path = os.path.join(self.backup_dir, 'latest_version')
    # Special case for the initial version.
    self.hdfs_client.write(self.latest_version_path, data=str(0))
    if self.node.type != 'sink':
        self.version_acks = dict()
        for n in self.node.downstream_connectors:
            self.version_acks[n] = 0
def __init__(self):
    self.client = Config().get_client('dev')
    # Create the 'datasets' directory on first use; the hdfs library raises
    # HdfsError (importable from hdfs.util) when listing a missing path, so
    # catch that instead of a bare except.
    try:
        self.client.list('datasets')
    except HdfsError:
        self.client.makedirs('datasets')
def main():
    client = Config(path=hdfscliconf).get_client()
    with client.read('/user/orenault/passwd') as input:
        #print input.read()
        df = pd.read_csv(input, sep=':', header=None)
        cols = df.iloc[:, 0]
        client.write('/user/orenault/data.avro',
                     cols.to_csv(sep=":", header=True, index=False),
                     overwrite=True)
def __init__(self):
    self.client = Config().get_client('dev')
    self.prompt = 'homura_fs $ '
    self.name = None
    self.local_xml = None
    self.hdfs_xml = '.last_sync.xml'
    self.hdfs_loc_xml = None
    self.mount_root = None  #os.getcwd() + '/test'
    self.hdfs_root = '/cs219'
    self.meta = HomuraMeta()
    self.monitor = None
    if sys.platform.startswith('darwin'):
        logging.basicConfig(filename='mylog.log', level=logging.INFO)
        self.monitor = Monitor_Start()
def main(): conf = SparkConf().setAppName("binarize nifti") sc = SparkContext(conf=conf) sc.setLogLevel('ERROR') parser = argparse.ArgumentParser(description='Binarize images') parser.add_argument('threshold', type=int, help="binarization threshold") parser.add_argument('folder_path', type=str, help='folder path containing all of the splits') parser.add_argument('output_path', type=str, help='output folder path') parser.add_argument('num', type=int, choices=[2, 4, 6, 8], help='number of binarization operations') parser.add_argument('-m', '--in_memory', type=bool, default=True, help='in memory computation') args = parser.parse_args() nibRDD = sc.binaryFiles(args.folder_path)\ .map(lambda x: get_data(x)) client = Config().get_client('dev') if args.in_memory == 'True': print "Performing in-memory computations" for i in xrange(num - 1): nibRDD = nibRDD.map(lambda x: binarize(x, args.threshold)) nibRDD = nibRDD.map(lambda x: binarize_and_save( x, args.threshold, args.output_path, client)).collect() else: print "Writing intermediary results to disk and loading from disk" binRDD = nibRDD.map(lambda x: binarize_and_save( x, args.threshold, args.output_path + "1", client)).collect() for i in xrange(num - 1): binRDD = sc.binaryFiles(args.output_path + "1")\ .map(lambda x: get_data(x))\ .map(lambda x: binarize_and_save(x, args.threshold, args.output_path + "1", client)).collect()
def main():
    arg = parsing_options()
    client = Config().get_client()
    with client.read(arg.input) as inputFile:
        # Load the file into a dataframe; the with-block closes inputFile
        # afterwards (the original's bare `inputFile.closed` was a no-op).
        df = pd.read_csv(inputFile, sep=arg.delimiter, header=arg.header)

    # Open output file
    with client.write(arg.output, overwrite=arg.overwrite) as outputFile:
        # Flatten the list of columns
        column = list(itertools.chain.from_iterable(arg.column))
        # Open the RSA key
        key = get_key(arg.RSAkey, arg.operation)
        # Extract the columns which need to be hashed / encrypted
        cols = df.iloc[:, column]
        colName = cols.columns
        if arg.operation == 'decrypt':
            # Do not forget the comma behind the key: the correct Python
            # grammar for a singleton tuple is (1,) not (1), which is just
            # an expression with the value 1.
            df[colName] = df[colName].apply(decrypt, args=(key,), axis=1)
            df.to_csv(outputFile, sep=":", header=True, index=False)
        else:
            # Encrypt then hash - as otherwise we would encrypt the hash value.
            # Call function encrypt with the RSA key.
            encrypted = df[colName].apply(encrypt, args=(key,))  #, axis=1)
            # Rename headers so they do not clash when merging df with the
            # encrypted data frame.
            new_column = []
            for i in colName:
                new_column.append(str(i) + '_ENC')
            encrypted.columns = new_column
            # Concatenate both dataframes
            df = pd.concat([df, encrypted], axis=1)
            # Generate a hash
            df[colName] = df[colName].apply(hash_value).values
            # Write to file
            df.to_csv(outputFile, sep=":", header=True, index=False)
def __init__(self, deviceInfoTableName, kind, dataBaseInfo,
             needFields="*", schema=None):
    self.dataBaseInfo = dataBaseInfo
    self.prefix = deviceInfoTableName
    self.kind = kind
    self.initDir = "/user/ct_fota/YangShuxuanNotDelete"
    self.iniFileName = self.kind + "/" + self.prefix + ".ini"
    self.needFields = needFields
    #self.initLog()
    self.connectDB()
    self.clientHDFS = Config().get_client()
    self.changtimes = 0
    self.schema = schema
def __init__(self, path, config=None, dynamic_getters=False,
             absolute_paths=True, regex_search=False):
    """
    A container for all the files and metadata found at the specified path.

    Args:
        path (str): The root path of the layout.
        config (str): The path to the JSON config file that defines the
            entities and paths for the current layout.
        dynamic_getters (bool): If True, a get_{entity_name}() method will
            be dynamically added to the Layout every time a new Entity is
            created. This is implemented by creating a partial function of
            the get() function that sets the target argument to the
            entity name.
        absolute_paths (bool): If True, grabbit uses absolute file paths
            everywhere (including when returning query results). If False,
            the input path will determine the behavior (i.e., relative if
            a relative path was passed, absolute if an absolute path was
            passed).
        regex_search (bool): Whether to require exact matching (True) or
            regex search (False, default) when comparing the query string
            to each entity in .get() calls. This sets a default for the
            instance, but can be overridden in individual .get() requests.
    """
    self._hdfs_client = Config().get_client()

    path = abspath(path) if absolute_paths and self._hdfs_client is None \
        else path

    # Preprocess the config file
    if isinstance(config, six.string_types):
        config = '/'.join(config.strip('hdfs://').split('/')[1:])
        config = config.replace(self._hdfs_client.root[1:], '')
        with self._hdfs_client.read(config) as reader:
            config = json.load(reader)

    super(HDFSLayout, self).__init__(path, config, dynamic_getters,
                                     absolute_paths, regex_search)
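# A minimal usage sketch for the constructor above, assuming a grabbit-style
# entity config stored on HDFS (host, port, paths, and the 'subject' entity
# are illustrative, not taken from the snippet):
#
#   layout = HDFSLayout('hdfs://namenode:9000/data/project',
#                       config='hdfs://namenode:9000/data/project/config.json')
#   files = layout.get(subject='01')  # standard grabbit query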
def main(): conf = SparkConf().setAppName("binarize nifti") sc = SparkContext(conf=conf) sc.setLogLevel('ERROR') parser = argparse.ArgumentParser( description='Binarize images using FSL installed in a Docker container' ) parser.add_argument('threshold', type=int, help="binarization threshold") parser.add_argument('folder_path', type=str, help='folder path containing all of the splits') parser.add_argument('output_path', type=str, help='output folder path') args = parser.parse_args() print args.folder_path client = Config().get_client('dev') nibRDD = sc.binaryFiles(args.folder_path)\ .map(lambda x: get_data(x))\ .map(lambda x: binarize(x, args.threshold))\ .map(lambda x: copy_to_hdfs(x, args.output_path, client)).collect()
def __init__(self, profile):
    self.client = Config().get_client(profile)
#!/usr/bin/env python
# encoding: utf-8

"""Avro extension example."""

from hdfs import Config
from hdfs.ext.avro import AvroReader, AvroWriter


# Get the default alias' client.
client = Config().get_client()

# Some sample data.
records = [
    {'name': 'Ann', 'age': 23},
    {'name': 'Bob', 'age': 22},
]

# Write an Avro File to HDFS (since our records' schema is very simple, we let
# the writer infer it automatically, otherwise we would pass it as argument).
with AvroWriter(client, 'names.avro', overwrite=True) as writer:
    for record in records:
        writer.write(record)

# Read it back.
with AvroReader(client, 'names.avro') as reader:
    schema = reader.schema  # The inferred schema.
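    # Completing the read-back is a small sketch: AvroReader is also
    # iterable, yielding the records back as dicts (this mirrors the
    # upstream hdfs documentation's Avro example).
    for record in reader:
        print(record)  # e.g. {'name': 'Ann', 'age': 23}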
def get_hdfs(alias='lake'):
    # https://hdfscli.readthedocs.io/en/latest/api.html
    from hdfs import Config
    client = Config().get_client(alias)
    return client
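# A minimal usage sketch for the helper above; it assumes a [lake.alias]
# section exists in the default ~/.hdfscli.cfg (the alias and the listed
# path are illustrative, not part of the original snippet):
if __name__ == '__main__':
    client = get_hdfs()      # resolve the 'lake' alias from ~/.hdfscli.cfg
    print(client.list('/'))  # smoke test: list the HDFS root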
def __init__(self, debug=False):
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        '.hdfscli.cfg')
    self.client = Config(path).get_client()
    self.debug = debug
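# A sketch of what the .hdfscli.cfg read above is expected to contain; the
# alias name, URL, and user are placeholders (the same INI format is written
# programmatically in a later snippet in this section):
#
#   [global]
#   default.alias = dev
#
#   [dev.alias]
#   url = http://namenode:50070
#   user = hadoop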
HDFS_OUTPUT_DIR = "/OUTPUT/" HDFS_BASE_URL = "hdfs://bdrenfdludcf01:9000" if __name__ == "__main__": # Folder creation for placing all the spark data cmd_a = "mkdir -p " + "/tmp/SPARK_PROCESS/" os.system(cmd_a) # Configure Spark conf = SparkConf().setAppName(APP_NAME).set("spark.local.dir", "/tmp/SPARK_PROCESS/") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) client = Config().get_client('bdrenhdfs') files = client.list(HDFS_RAWFILE_DIR) totalfilecount = len(files) if totalfilecount == 0: print("There is no files to be processed, application exiting...") sys.exit(0) filecount = 0 for filename in files: print(filename) if filename.find("Covid_Analysis_DataSet.csv") >= 0: filecount = filecount + 1 df_covid = sqlContext.read.format("csv").option( "delimiter",
#!/usr/bin/python
# -*- coding: utf-8 -*-

import hashlib
import urllib.request

from pyquery import PyQuery as pq
from mongoconnect import *
from hdfs import Config

client = Config().get_client('dev')

KEY_WORD = 'news'
exec('database=db_' + KEY_WORD)


def fetchData(item):
    request = urllib.request.Request(item['href'])
    result = urllib.request.urlopen(request, timeout=25)
    # NOTE: the original test `result.code == 200 or 204` was always true
    # (a non-zero literal is truthy); membership testing is what was meant.
    if result.code in (200, 204):
        ts = str(result.read(), encoding='gbk')
        d = pq(ts)
        d = d('div#content')
        head = d('div.hd h1').text()
        clas = d('div.a_Info span.a_catlog').text()
        source = d('div.a_Info span.a_source').text()
        time = d('div.a_Info span.a_time').text()
        body = d('div#Cnt-Main-Article-QQ p').text()
        print(time, ' ', clas, ' ', source, ' ', head)
        newhashid = hashlib.md5((head + time).encode()).hexdigest()
        print(body)
        # mongo update class and source,
# In[1]:

import os
import subprocess

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
import hyperloglog
from concurrent.futures import Future
from hdfs import Config
from hdfs.util import HdfsError

try:
    client = Config().get_client()
except HdfsError:
    # No usable hdfscli configuration yet: write a minimal one pointing at
    # the cluster's namenode, then retry with it.
    config_fname = "hdfscli.cfg"
    with open(config_fname, "wt") as f:
        f.write("""
[global]
default.alias = default

[default.alias]
url = http://mipt-master.atp-fivt.org:50070
user = {user}
""".format(user=os.environ["USER"]))
    client = Config(config_fname).get_client()

nn_address = subprocess.check_output(
    'hdfs getconf -confKey dfs.namenode.http-address',
    shell=True).strip().decode("utf-8")
def main(argv):
    # Validate the input arguments.
    # NOTE: "s:" was missing from the original option string, so the -s
    # branch below could never be reached.
    try:
        opts, args = getopt.getopt(argv, "hd:n:l:m:s:")
    except getopt.GetoptError:
        print('usage: spark-submit \\ \n --master <master> \\ \n <path>/ClasificacionImagenes.py \\')
        print(' [-d <output_dir>] [-n <num_images>] \\ \n ')
        print(' [-l <batch_size>] [-m <max_labels>] [-s <url_images>]')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('usage: spark-submit \\ \n --master <master> \\ \n <path>/ClasificacionImagenes.py \\')
            print(' [-d <output_dir>] [-n <num_images>] \\ \n ')
            print(' [-l <batch_size>] [-m <max_labels>] [-s <url_images>]')
            sys.exit()
        elif opt == "-s":
            C.images_index_url = arg
        elif opt == "-d":
            C.dir_classification = arg
        elif opt == "-n":
            C.numero_imagenes_proceso = int(arg)
        elif opt == "-l":
            C.lote_size = int(arg)
        elif opt == "-m":
            C.max_etiquetas = int(arg)

    print("Directory: ", C.dir_classification)
    print("Number of images to process: ", C.numero_imagenes_proceso)
    print("Images per batch: ", C.lote_size)
    print("Max labels to keep: ", C.max_etiquetas)

    # ***********************************************************************
    # Start of the process
    # ***********************************************************************
    global node_lookup_bc
    global model_data_bc

    # Start the SparkContext
    print("Start: ", datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))
    sc = SparkContext(
        appName='Clasificacion MirFlickr con TensorFlow',
        pyFiles=['/home/utad/TFM/Fuentes/TensorFlowMirFlickr/Constantes.py',
                 '/home/utad/TFM/Fuentes/TensorFlowMirFlickr/NodeLookup.py'])

    get_tensorflow_model()

    # Load the model and broadcast it to the workers
    model_path = os.path.join(C.model_dir, 'classify_image_graph_def.pb')
    with tf.gfile.FastGFile(model_path, 'rb') as f:
        model_data = f.read()
    model_data_bc = sc.broadcast(model_data)

    # Broadcast the node lookup for use in the workers
    node_lookup = NodeLookup().node_lookup
    node_lookup_bc = sc.broadcast(node_lookup)

    # Get the list of images to process and group them into batches
    servicio_imagenes = None
    try:
        servicio_imagenes = urllib.urlopen(C.images_index_url)
    except Exception as e:
        print(e)
        print("Image server not available")
        exit(404)
    imagenes = servicio_imagenes.read().split('<li>')[2:C.numero_imagenes_proceso + 2]
    lote_imagenes = [imagenes[i:i + C.lote_size]
                     for i in range(0, len(imagenes), C.lote_size)]

    # Parallelize the image batches and process them
    rdd_imagenes = sc.parallelize(lote_imagenes).map(
        lambda x: map(obtener_nombre_imagen, x))
    inception_rdd = rdd_imagenes.flatMap(procesar_lote_imagenes)

    # Delete the categories directory from HDFS in case it already exists
    client = Config().get_client()
    client.delete('inception', recursive=True)

    # Save the resulting files in JSON format, using a dataframe
    print("Processing:", datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))
    spark = SparkSession(sc)
    inception_df = inception_rdd.toDF()
    inception_df.write.json(C.dir_classification)
    print("End:", datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))
def main():
    arg = parsing_options()
    krb_client = Config(path=arg.hdfsConf).get_client()
    az_conf = read_conf(arg.azureConf)
    az_client = az_key_vault_connection(az_conf['azure_client_id'],
                                        az_conf['azure_client_secret'],
                                        az_conf['azure_tenant_id'])
    az_rsa_key = az_get_rsa_key_info(az_client, az_conf['key_vault'],
                                     az_conf['key_name'])
    column = list(itertools.chain.from_iterable(arg.column))
    with krb_client.read(arg.input) as inputFile:
        with krb_client.write(arg.output, overwrite=arg.overwrite) as outputFile:
            if arg.operation == 'encrypt':
                aes_key = generate_aes_key()
                az_conf['uuid'] = str(uuid.uuid4())
                encrypt_and_store_aes_key(az_client, az_conf,
                                          az_rsa_key['version'],
                                          base64.b64encode(aes_key))
                df = pd.read_csv(inputFile, sep=arg.delimiter,
                                 header=arg.header, dtype=str, chunksize=10000)
                num_chunk = 0
                for chunk in df:
                    # Generate the new column names and hash in place
                    new_column = []
                    for i in column:
                        new_column.append(str(i) + '_HASH')
                    chunk[new_column] = chunk[column].apply(hash_value)
                    # Encrypt in place
                    chunk[column] = chunk[column].apply(
                        encrypt, args=(aes_key, az_conf['uuid']))
                    if num_chunk == 0:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=True, index=False)
                        num_chunk += 1
                    else:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=False, index=False)
            else:
                df = pd.read_csv(inputFile, sep=arg.delimiter,
                                 header=arg.header, dtype=str, chunksize=1000)
                num_chunk = 0
                for chunk in df:
                    if num_chunk == 0:
                        # Splitting only the first column: grab the 3rd
                        # field (the key) and take its first value [0]
                        key = base64.b64decode(chunk[column[0]].str.split(
                            pat='-', n=3, expand=True)[3][0])
                        aes_key = retrieve_and_decrypt_aes_key(
                            az_client, az_conf, az_rsa_key['version'], key)
                    chunk[column] = chunk[column].apply(decrypt,
                                                        args=(aes_key,))
                    if num_chunk == 0:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=True, index=False)
                        num_chunk += 1
                    else:
                        chunk.to_csv(outputFile, sep=arg.delimiter,
                                     header=False, index=False)
def __init__(self, datasource):
    self.datasource = datasource
    self.client = Config().get_client("dev")