示例#1
0
    def __init__(self, sc, spark_session, uri, port):
        """Store the Spark handles, derive storage paths and load saved state.

        Args:
            sc: Spark context, kept on the instance unchanged.
            spark_session: Spark session, kept on the instance unchanged.
            uri: First half of ``base_path``.
            port: Second half of ``base_path``; it is joined to ``uri`` with
                ``+``, so it is expected to be a string.
        """
        self.sc = sc
        self.spark_session = spark_session

        # Containers that start out empty; the load_* calls below run last.
        self.df, self.models, self.graphs = [], [], []

        self.base_path = uri + ":" + port

        # Local pickles live next to this source file, one directory up.
        module_dir = os.path.dirname(os.path.realpath(__file__))
        self.local_pickle_path = module_dir + '/../pickles/'

        # Fixed pickle root; everything below hangs off it.
        pickle_root = '/user/hadoop/pickles/'
        self.pickle_path = pickle_root
        self.model_path = pickle_root + 'models/'

        dataset_root = pickle_root + "dataset/"
        self.dataset_path = dataset_root
        self.private_release_path = dataset_root + "private/"
        self.anon_release_path = dataset_root + "github/"
        self.prod_release_path = dataset_root + "prod/"

        self.df_path = pickle_root + 'df/'
        self.labelled_df_path = self.df_path + 'labelled/'
        # Graphs are the only artefacts placed under the local pickle path.
        self.graph_path = self.local_pickle_path + 'graphs/'

        self.hdfs_client = Config().get_client('dev')

        self.load_df()
        self.load_models()
        self.load_graphs()
示例#2
0
    def __init__(self, backup_dir, node):
        """Create the backup directory and its bookkeeping files for ``node``.

        Args:
            backup_dir: Directory that holds the backup files.
            node: Owning node; its ``type`` and ``downstream_connectors``
                attributes are read here.
        """
        # TODO: not cut
        # each pending window (or node) only has a single downstream cut,
        # otherwise inconsistency occurs during truncating
        self.backup_dir = backup_dir
        self.node = node

        client = Config().get_client('dev')
        self.hdfs_client = client
        client.makedirs(backup_dir)

        # Backup files are named after their ending version, so the file
        # currently being written carries a temporary name.
        self.current_backup_path = os.path.join(backup_dir, 'current')
        # Touch the file so later appends have something to append to.
        client.write(self.current_backup_path, data='')

        initial_version = str(0)

        # Version that the last truncation was conducted against.
        self.safe_version_path = os.path.join(backup_dir, 'safe_version')
        client.write(self.safe_version_path, data=initial_version)

        # Latest integral version.
        self.latest_version_path = os.path.join(backup_dir, 'latest_version')
        client.write(self.latest_version_path, data=initial_version)

        # Sink nodes get no acknowledgement table.
        if node.type == 'sink':
            return
        self.version_acks = {
            connector: 0 for connector in node.downstream_connectors
        }
示例#3
0
    def __init__(self):
        """Connect to the 'dev' HDFS alias and make sure 'datasets' exists."""
        self.client = Config().get_client('dev')

        try:
            # Listing the directory doubles as an existence probe.
            self.client.list('datasets')
        except Exception:
            # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
            # SystemExit propagate; any ordinary failure of the probe still
            # falls back to creating the directory, as before.
            self.client.makedirs('datasets')
示例#4
0
def main():
    """Copy the first ':'-separated column of an HDFS file into another file.

    Reads '/user/orenault/passwd' and writes the first column back as
    ':'-separated text with a header row to '/user/orenault/data.avro'
    (the payload is the output of ``to_csv`` despite the file extension).
    """
    client = Config(path=hdfscliconf).get_client()
    with client.read('/user/orenault/passwd') as passwd_file:
        frame = pd.read_csv(passwd_file, sep=':', header=None)
        first_column = frame.iloc[:, 0]
        payload = first_column.to_csv(sep=":", header=True, index=False)
        # The write happens while the reader is still open, as before.
        client.write('/user/orenault/data.avro', payload, overwrite=True)
 def __init__(self):
     """Set up the HDFS client, shell defaults and (on macOS) the monitor."""
     self.client = Config().get_client('dev')
     self.prompt = 'homura_fs $ '

     # Not known yet at construction time.
     self.name = None
     self.local_xml = None
     self.hdfs_loc_xml = None
     self.mount_root = None  #os.getcwd() + '/test'
     self.monitor = None

     self.hdfs_xml = '.last_sync.xml'
     self.hdfs_root = '/cs219'
     self.meta = HomuraMeta()

     # Logging and the monitor are only configured when sys.platform
     # starts with 'darwin'; elsewhere the monitor stays None.
     if not sys.platform.startswith('darwin'):
         return
     logging.basicConfig(filename='mylog.log', level=logging.INFO)
     self.monitor = Monitor_Start()
示例#6
0
def main():
    """Binarize every image under a folder with Spark, ``num`` times over.

    Command-line arguments:
        threshold (int): binarization threshold.
        folder_path (str): folder containing all of the splits.
        output_path (str): output folder.
        num (int, one of 2/4/6/8): number of binarization operations.
        -m/--in_memory (true/false, default true): chain the operations in
            memory, or write each intermediate result and read it back.

    Fixes over the previous version:
        * ``--in_memory`` was declared with ``type=bool`` and then compared
          to the string 'True', so the in-memory branch could never run.
        * The loops used an undefined name ``num`` instead of ``args.num``.
        * ``print``/``range`` are written so the code parses on both
          Python 2 and Python 3.
    """

    def parse_bool(text):
        # argparse's ``type=bool`` turns every non-empty string (including
        # "False") into True, so the flag is interpreted explicitly.
        return str(text).strip().lower() in ('true', 't', 'yes', 'y', '1')

    conf = SparkConf().setAppName("binarize nifti")
    sc = SparkContext(conf=conf)
    sc.setLogLevel('ERROR')

    parser = argparse.ArgumentParser(description='Binarize images')
    parser.add_argument('threshold', type=int, help="binarization threshold")
    parser.add_argument('folder_path',
                        type=str,
                        help='folder path containing all of the splits')
    parser.add_argument('output_path', type=str, help='output folder path')
    parser.add_argument('num',
                        type=int,
                        choices=[2, 4, 6, 8],
                        help='number of binarization operations')
    parser.add_argument('-m',
                        '--in_memory',
                        type=parse_bool,
                        default=True,
                        help='in memory computation')

    args = parser.parse_args()

    nibRDD = sc.binaryFiles(args.folder_path)\
        .map(lambda x: get_data(x))

    client = Config().get_client('dev')

    if args.in_memory:
        print("Performing in-memory computations")

        # num - 1 in-memory passes, then one final pass that also saves.
        for _ in range(args.num - 1):
            nibRDD = nibRDD.map(lambda x: binarize(x, args.threshold))
        nibRDD.map(lambda x: binarize_and_save(
            x, args.threshold, args.output_path, client)).collect()

    else:
        print("Writing intermediary results to disk and loading from disk")

        nibRDD.map(lambda x: binarize_and_save(
            x, args.threshold, args.output_path + "1", client)).collect()

        # NOTE(review): every pass reads from and writes to the same
        # ``output_path + "1"`` folder, exactly as before - confirm that
        # overwriting in place is intended.
        for _ in range(args.num - 1):
            sc.binaryFiles(args.output_path + "1")\
                .map(lambda x: get_data(x))\
                .map(lambda x: binarize_and_save(
                    x, args.threshold, args.output_path + "1", client))\
                .collect()
def main():
  """Encrypt-and-hash or decrypt selected columns of a delimited HDFS file.

  The input file is loaded into a DataFrame, the columns named on the
  command line are transformed, and the result is written to the output
  file with ':' as separator and a header row.

  * operation == 'decrypt': the selected columns are replaced by the result
    of ``decrypt``, applied row-wise.
  * any other operation: an encrypted copy of each selected column is added
    as ``<name>_ENC`` and the original column is replaced by its hash.
  """
  arg = parsing_options()
  client = Config().get_client()
  with client.read(arg.input) as inputFile:
    # Load file in dataframe
    df = pd.read_csv(inputFile, sep=arg.delimiter, header=arg.header)

  # Open output file
  with client.write(arg.output, overwrite=arg.overwrite) as outputFile:

    # Flatten the list of columns
    column = list(itertools.chain.from_iterable(arg.column))
    # open RSA key
    key = get_key(arg.RSAkey, arg.operation)

    # Labels of the columns which need to be hashed / encrypted
    colName = df.iloc[:, column].columns

    if arg.operation == 'decrypt':
      # ``args`` must be a tuple: (key,) is a one-element tuple, whereas
      # (key) would just be the key itself.
      df[colName] = df[colName].apply(decrypt, args=(key,), axis=1)
    else:
      # Encrypt first, hash afterwards - otherwise the hash value would be
      # what gets encrypted.
      encrypted = df[colName].apply(encrypt, args=(key,))

      # Rename so the encrypted columns do not clash with the originals
      # when both frames are concatenated.
      encrypted.columns = [str(name) + '_ENC' for name in colName]
      df = pd.concat([df, encrypted], axis=1)

      # Replace the original columns by their hash
      df[colName] = df[colName].apply(hash_value).values

    # Write to file (same call for both operations)
    df.to_csv(outputFile, sep=":", header=True, index=False)
 def __init__(self,
              deviceInfoTableName,
              kind,
              dataBaseInfo,
              needFields="*",
              schema=None):
     """Record the table settings, connect to the database, open HDFS.

     Args:
         deviceInfoTableName: Stored as ``prefix``; also the base name of
             the ini file.
         kind: Stored as ``kind``; also the folder part of the ini file
             name.
         dataBaseInfo: Stored unchanged for later use.
         needFields: Stored unchanged; defaults to "*".
         schema: Stored unchanged; defaults to None.
     """
     self.prefix = deviceInfoTableName
     self.kind = kind
     self.dataBaseInfo = dataBaseInfo
     self.needFields = needFields

     self.initDir = "/user/ct_fota/YangShuxuanNotDelete"
     # Relative ini file name of the form "<kind>/<prefix>.ini".
     self.iniFileName = kind + "/" + deviceInfoTableName + ".ini"

     #self.initLog()
     # connectDB() runs once the attributes above exist and before the
     # HDFS client and the remaining fields are assigned.
     self.connectDB()

     self.clientHDFS = Config().get_client()
     self.changtimes = 0
     self.schema = schema
示例#9
0
文件: hdfs.py 项目: qmac/grabbit
    def __init__(self,
                 path,
                 config=None,
                 dynamic_getters=False,
                 absolute_paths=True,
                 regex_search=False):
        """
        A container for all the files and metadata found at the specified path.
        Args:
            path (str): The root path of the layout.
            config (str): The path to the JSON config file that defines the
                entities and paths for the current layout. When a string is
                given it is read through the HDFS client; an optional
                leading ``hdfs://`` and the first path component after it
                are dropped before reading.
            dynamic_getters (bool): If True, a get_{entity_name}() method will
                be dynamically added to the Layout every time a new Entity is
                created. This is implemented by creating a partial function of
                the get() function that sets the target argument to the
                entity name.
            absolute_paths (bool): If True, grabbit uses absolute file paths
                everywhere (including when returning query results). If False,
                the input path will determine the behavior (i.e., relative if
                a relative path was passed, absolute if an absolute path was
                passed).
            regex_search (bool): Default matching mode used when comparing
                the query string to each entity in .get() calls; it can be
                overridden in individual .get() requests. The value is
                forwarded unchanged to the base class.
                NOTE(review): the previous text described True as exact
                matching and False as regex search, which looks inverted
                relative to the parameter name - confirm against the base
                class.
        """
        self._hdfs_client = Config().get_client()

        # Only made absolute when no HDFS client is available.
        if absolute_paths and self._hdfs_client is None:
            path = abspath(path)

        # Preprocess the config file
        if isinstance(config, six.string_types):
            # ``str.strip('hdfs://')`` removes any of the characters
            # h, d, f, s, ':' and '/' from BOTH ends (so e.g. a name ending
            # in "s" lost its last letter). Remove the scheme as a prefix
            # instead; leading slashes are still dropped as before.
            scheme = 'hdfs://'
            if config.startswith(scheme):
                config = config[len(scheme):]
            config = config.lstrip('/')
            # Drop the first component (the host part of an hdfs:// URL).
            config = '/'.join(config.split('/')[1:])
            config = config.replace(self._hdfs_client.root[1:], '')
            with self._hdfs_client.read(config) as reader:
                config = json.load(reader)

        super(HDFSLayout, self).__init__(path, config, dynamic_getters,
                                         absolute_paths, regex_search)
示例#10
0
文件: binarize_fsl.py 项目: gkiar/sim
def main():
    """Binarize every image under a folder and copy the results to HDFS.

    Command-line arguments:
        threshold (int): binarization threshold.
        folder_path (str): folder containing all of the splits.
        output_path (str): output folder.

    The ``print`` call is written in function form so the module parses on
    both Python 2 and Python 3 (the statement form is a SyntaxError on 3).
    """

    conf = SparkConf().setAppName("binarize nifti")
    sc = SparkContext(conf=conf)
    sc.setLogLevel('ERROR')

    parser = argparse.ArgumentParser(
        description='Binarize images using FSL installed in a Docker container'
    )
    parser.add_argument('threshold', type=int, help="binarization threshold")
    parser.add_argument('folder_path',
                        type=str,
                        help='folder path containing all of the splits')
    parser.add_argument('output_path', type=str, help='output folder path')

    args = parser.parse_args()

    print(args.folder_path)
    client = Config().get_client('dev')

    # The collected result was never used, so it is no longer bound to a
    # name; collect() is kept because it triggers the pipeline.
    sc.binaryFiles(args.folder_path)\
        .map(lambda x: get_data(x))\
        .map(lambda x: binarize(x, args.threshold))\
        .map(lambda x: copy_to_hdfs(x, args.output_path, client)).collect()
示例#11
0
 def __init__(self, profile):
     """Create the HDFS client for the alias named by ``profile``."""
     hdfs_config = Config()
     self.client = hdfs_config.get_client(profile)
示例#12
0
#!/usr/bin/env python
# encoding: utf-8
"""Avro extension example."""

from hdfs import Config
from hdfs.ext.avro import AvroReader, AvroWriter

# Client for the default alias of the HdfsCLI configuration.
client = Config().get_client()

# Two sample rows to round-trip through Avro.
records = [
    {'name': 'Ann', 'age': 23},
    {'name': 'Bob', 'age': 22},
]

# Write the rows to an Avro file on HDFS. No schema is passed to the writer
# because the records are simple enough for it to be inferred.
with AvroWriter(client, 'names.avro', overwrite=True) as writer:
    for record in records:
        writer.write(record)

# Open the same file again for reading.
with AvroReader(client, 'names.avro') as reader:
    schema = reader.schema  # The inferred schema.
示例#13
0
def get_hdfs(alias='lake'):
    """Return an HDFS client for ``alias`` (defaults to ``'lake'``)."""
    # API reference: https://hdfscli.readthedocs.io/en/latest/api.html
    # Imported here so the dependency is only needed when this is called.
    from hdfs import Config
    return Config().get_client(alias)
示例#14
0
 def __init__(self, debug=False):
     """Open an HDFS client from the '.hdfscli.cfg' next to this module.

     Args:
         debug: Stored unchanged on the instance.
     """
     module_dir = os.path.dirname(os.path.abspath(__file__))
     cfg_file = os.path.join(module_dir, '.hdfscli.cfg')
     self.client = Config(cfg_file).get_client()
     self.debug = debug
HDFS_OUTPUT_DIR = "/OUTPUT/"
HDFS_BASE_URL = "hdfs://bdrenfdludcf01:9000"

if __name__ == "__main__":

    # Folder creation for placing all the spark data
    cmd_a = "mkdir -p " + "/tmp/SPARK_PROCESS/"
    os.system(cmd_a)

    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME).set("spark.local.dir",
                                                "/tmp/SPARK_PROCESS/")

    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    client = Config().get_client('bdrenhdfs')
    files = client.list(HDFS_RAWFILE_DIR)
    totalfilecount = len(files)

    if totalfilecount == 0:
        print("There is no files to be processed, application exiting...")
        sys.exit(0)

    filecount = 0

    for filename in files:
        print(filename)
        if filename.find("Covid_Analysis_DataSet.csv") >= 0:
            filecount = filecount + 1
            df_covid = sqlContext.read.format("csv").option(
                "delimiter",
示例#16
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib, urllib.request
from pyquery import PyQuery as pq
from mongoconnect import *
import hashlib

from hdfs import Config
client = Config().get_client('dev')

KEY_WORD = 'news'
# Bind ``database`` to the module-level name ``db_<KEY_WORD>`` (presumably
# supplied by the star-import above - verify). A plain globals() lookup
# replaces the former ``exec`` of a generated assignment statement.
database = globals()['db_' + KEY_WORD]


def fetchData(item):
    """Download the page at ``item['href']`` and print its parsed fields.

    Args:
        item: Mapping with an 'href' key holding the URL to fetch.

    The response body is decoded as GBK and parsed with PyQuery; headline,
    category, source, time and body text are extracted from the
    ``div#content`` element and printed.
    """
    request = urllib.request.Request(item['href'])
    result = urllib.request.urlopen(request, timeout=25)
    # ``result.code == 200 or 204`` was always truthy because the constant
    # 204 is non-zero; test membership so only these statuses are handled.
    if result.code in (200, 204):
        ts = str(result.read(), encoding='gbk')
        d = pq(ts)
        d = d('div#content')
        head = d('div.hd h1').text()
        clas = d('div.a_Info span.a_catlog').text()
        source = d('div.a_Info span.a_source').text()
        time = d('div.a_Info span.a_time').text()
        body = d('div#Cnt-Main-Article-QQ p').text()
        print(time, '  ', clas, '   ', source, '  ', head)
        # MD5 of headline + time, computed as an identifier for the article.
        newhashid = hashlib.md5((head + time).encode()).hexdigest()
        print(body)
        # mongo: update class and source,
示例#17
0
# In[1]:


import os
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

import hyperloglog
from concurrent.futures import Future

from hdfs import Config
import subprocess

try:
    client = Config().get_client()
except Exception:
    # Narrowed from a bare ``except:`` so Ctrl-C / SystemExit are not
    # swallowed. When the default configuration cannot be used, write a
    # minimal config file into the working directory and load that one.
    config_fname = "hdfscli.cfg"
    with open(config_fname, "wt") as f:
        f.write("""
[global]
default.alias = default

[default.alias]
url = http://mipt-master.atp-fivt.org:50070
user = {user}
        """.format(user=os.environ["USER"]))
    client = Config(config_fname).get_client()


# Ask the hdfs CLI for the configured namenode HTTP address. The command is
# passed as an argument list, so no shell is involved.
nn_address = subprocess.check_output(
    ['hdfs', 'getconf', '-confKey', 'dfs.namenode.http-address']
).strip().decode("utf-8")
示例#18
0
def main(argv):
    """Classify a batch of images with Spark and save the result as JSON.

    Args:
        argv: Command-line arguments without the program name. Options:
            -h        print usage and exit
            -d DIR    output directory (``C.dir_classification``)
            -n N      number of images to process
            -l N      images per batch
            -m N      maximum number of labels to keep
            -s URL    URL of the image index (``C.images_index_url``)

    Fixes over the previous version: the getopt option string lacked ``s:``,
    so the documented ``-s`` option was rejected as unknown; the usage text
    is printed by one helper; ``sys.exit`` replaces the site-provided
    ``exit``.
    """

    def print_usage():
        # Single copy of the usage text shown on -h and on bad options.
        print(
            'usage: spark-submit \\ \n --master <master> \\ \n <path>/ClasificacionImagenes.py \\'
        )
        print(' [-d <directorio_salida] [-n <numero_imagenes>] \\ \n ')
        print(' [-l <tamaño_lote] [-m max_etiquetas] [-s url_images]')

    # Validate the input
    try:
        opts, _ = getopt.getopt(argv, "hd:n:l:m:s:")
    except getopt.GetoptError:
        print_usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print_usage()
            sys.exit()
        elif opt == "-s":
            C.images_index_url = arg
        elif opt == "-d":
            C.dir_classification = arg
        elif opt == "-n":
            C.numero_imagenes_proceso = int(arg)
        elif opt == "-l":
            C.lote_size = int(arg)
        elif opt == "-m":
            C.max_etiquetas = int(arg)

    print("Directorio :             ", C.dir_classification)
    print("Num Imagenes a procesar: ", C.numero_imagenes_proceso)
    print("Imagenes por lote:       ", C.lote_size)
    print("Max etiquetas a guardar: ", C.max_etiquetas)

    # ***************************************************************************
    # Start of the process
    # ***************************************************************************
    global node_lookup_bc
    global model_data_bc

    # Start the SparkContext
    print("Inicio: ",
          datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))
    sc = SparkContext(
        appName='Clasificacion MirFlickr con TensorFlow',
        pyFiles=[
            '/home/utad/TFM/Fuentes/TensorFlowMirFlickr/Constantes.py',
            '/home/utad/TFM/Fuentes/TensorFlowMirFlickr/NodeLookup.py'
        ])
    get_tensorflow_model()

    # Load the model and broadcast it
    model_path = os.path.join(C.model_dir, 'classify_image_graph_def.pb')
    with tf.gfile.FastGFile(model_path, 'rb') as f:
        model_data = f.read()
    model_data_bc = sc.broadcast(model_data)

    # Broadcast the node lookup so the workers can use it
    node_lookup = NodeLookup().node_lookup
    node_lookup_bc = sc.broadcast(node_lookup)

    # Fetch the list of images to process and group it into batches
    # NOTE(review): urllib.urlopen only exists on Python 2 - confirm the
    # interpreter this script targets.
    servicio_imagenes = None
    try:
        servicio_imagenes = urllib.urlopen(C.images_index_url)
    except Exception as e:
        print(e)
        print("Servidor de imágenes no disponible")
        sys.exit(404)

    # The first two '<li>'-separated pieces are skipped.
    imagenes = servicio_imagenes.read().split(
        '<li>')[2:C.numero_imagenes_proceso + 2]
    lote_imagenes = [
        imagenes[i:i + C.lote_size]
        for i in range(0, len(imagenes), C.lote_size)
    ]

    # Parallelize the image batches and process them
    rdd_imagenes = sc.parallelize(lote_imagenes).map(
        lambda x: map(obtener_nombre_imagen, x))
    inception_rdd = rdd_imagenes.flatMap(procesar_lote_imagenes)

    # Remove the directory from HDFS in case it already exists
    # NOTE(review): this deletes 'inception' while the JSON below is written
    # to C.dir_classification - confirm both refer to the same location.
    client = Config().get_client()
    client.delete('inception', recursive=True)

    # Save the results as JSON, going through a DataFrame
    print("Procesamos:",
          datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))
    # Kept although the name is unused: presumably the session must exist
    # for RDD.toDF() to be available - verify before removing.
    spark = SparkSession(sc)
    inception_df = inception_rdd.toDF()
    inception_df.write.json(C.dir_classification)
    print("Fin:", datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))
示例#19
0
def main():
    """Encrypt (and hash) or decrypt selected columns of an HDFS file.

    The input is streamed in chunks through pandas and written to the
    output file with the same delimiter; only the first chunk carries a
    header row.

    * 'encrypt': a new AES key is generated and stored through
      ``encrypt_and_store_aes_key`` under a fresh UUID; each selected
      column gets a ``<name>_HASH`` companion and is then encrypted in
      place.
    * any other operation: the AES key is recovered from the first row of
      the first selected column and the selected columns are decrypted in
      place.
    """
    arg = parsing_options()
    krb_client = Config(path=arg.hdfsConf).get_client()
    az_conf = read_conf(arg.azureConf)
    az_client = az_key_vault_connection(az_conf['azure_client_id'],
                                        az_conf['azure_client_secret'],
                                        az_conf['azure_tenant_id'])
    az_rsa_key = az_get_rsa_key_info(az_client, az_conf['key_vault'],
                                     az_conf['key_name'])
    column = list(itertools.chain.from_iterable(arg.column))

    with krb_client.read(arg.input) as inputFile, \
            krb_client.write(arg.output,
                             overwrite=arg.overwrite) as outputFile:
        if arg.operation == 'encrypt':
            aes_key = generate_aes_key()
            az_conf['uuid'] = str(uuid.uuid4())
            encrypt_and_store_aes_key(az_client, az_conf,
                                      az_rsa_key['version'],
                                      base64.b64encode(aes_key))
            chunks = pd.read_csv(inputFile,
                                 sep=arg.delimiter,
                                 header=arg.header,
                                 dtype=str,
                                 chunksize=10000)
            # Names of the hash columns, one per selected column.
            hash_columns = [str(name) + '_HASH' for name in column]
            for position, chunk in enumerate(chunks):
                # Hash into the new columns first, then encrypt in place.
                chunk[hash_columns] = chunk[column].apply(hash_value)
                chunk[column] = chunk[column].apply(encrypt,
                                                    args=(aes_key,
                                                          az_conf['uuid']))
                chunk.to_csv(outputFile,
                             sep=arg.delimiter,
                             header=(position == 0),
                             index=False)
        else:
            chunks = pd.read_csv(inputFile,
                                 sep=arg.delimiter,
                                 header=arg.header,
                                 dtype=str,
                                 chunksize=1000)
            for position, chunk in enumerate(chunks):
                is_first = position == 0
                if is_first:
                    # Split the first selected column on '-' (at most 3
                    # splits) and take piece 3 of row 0 as the encoded key.
                    pieces = chunk[column[0]].str.split(pat='-',
                                                        n=3,
                                                        expand=True)
                    key = base64.b64decode(pieces[3][0])
                    aes_key = retrieve_and_decrypt_aes_key(
                        az_client, az_conf, az_rsa_key['version'], key)
                chunk[column] = chunk[column].apply(decrypt,
                                                    args=(aes_key, ))
                chunk.to_csv(outputFile,
                             sep=arg.delimiter,
                             header=is_first,
                             index=False)
示例#20
0
 def __init__(self, datasource):
     """Keep ``datasource`` and open an HDFS client for the "dev" alias."""
     hdfs_config = Config()
     self.datasource = datasource
     self.client = hdfs_config.get_client("dev")