Пример #1
0
    def __init__(self, backup_dir, node):
        """Prepare the HDFS backup area for ``node`` under ``backup_dir``.

        Creates the backup directory, touches the ``current`` backup file and
        writes ``0`` into both the ``safe_version`` and ``latest_version``
        files.  When ``node.type`` is not ``'sink'`` a ``version_acks`` dict is
        created that maps every downstream connector of the node to ``0``.
        """
        # TODO: not cut
        # Each pending window (or node) has only a single downstream cut;
        # otherwise truncation would become inconsistent.
        self.backup_dir = backup_dir
        self.node = node

        hdfs = Config().get_client('dev')
        self.hdfs_client = hdfs
        hdfs.makedirs(self.backup_dir)

        # Backup files are named after their ending version, so the one that
        # is still being written gets a temporary name.  Touch it now so it
        # can be appended to later.
        self.current_backup_path = os.path.join(self.backup_dir, 'current')
        hdfs.write(self.current_backup_path, data='')

        # Version that the last truncation was conducted against; the initial
        # version is the special case 0.
        self.safe_version_path = os.path.join(self.backup_dir, 'safe_version')
        hdfs.write(self.safe_version_path, data='0')

        # Latest integral version; again starts at the special case 0.
        self.latest_version_path = os.path.join(self.backup_dir,
                                                'latest_version')
        hdfs.write(self.latest_version_path, data='0')

        if self.node.type != 'sink':
            self.version_acks = {
                connector: 0
                for connector in self.node.downstream_connectors
            }
Пример #2
0
    def __init__(self, sc, spark_session, uri, port):
        """Store the Spark handles, derive all storage paths and load caches.

        ``uri`` and ``port`` are concatenated as ``uri + ":" + port`` to form
        ``base_path``.  After the paths are set up an HDFS client for the
        ``'dev'`` alias is created and ``load_df()``, ``load_models()`` and
        ``load_graphs()`` are called, in that order.
        """
        self.sc, self.spark_session = sc, spark_session
        self.df, self.models, self.graphs = [], [], []
        self.base_path = uri + ":" + port

        # Local pickles live in a sibling directory of this source file
        module_dir = os.path.dirname(os.path.realpath(__file__))
        self.local_pickle_path = module_dir + '/../pickles/'
        self.graph_path = self.local_pickle_path + 'graphs/'

        # Remote layout
        self.pickle_path = '/user/hadoop/pickles/'
        self.model_path = '/user/hadoop/pickles/models/'

        dataset_root = self.pickle_path + "dataset/"
        self.dataset_path = dataset_root
        self.private_release_path = dataset_root + "private/"
        self.anon_release_path = dataset_root + "github/"
        self.prod_release_path = dataset_root + "prod/"

        df_root = self.pickle_path + 'df/'
        self.df_path = df_root
        self.labelled_df_path = df_root + 'labelled/'

        self.hdfs_client = Config().get_client('dev')

        for loader in (self.load_df, self.load_models, self.load_graphs):
            loader()
Пример #3
0
    def __init__(self):
        """Create the 'dev' HDFS client and ensure ``datasets`` exists.

        The directory is probed with ``list``; if that fails it is created
        with ``makedirs``.  Any error raised by ``makedirs`` propagates.
        """
        self.client = Config().get_client('dev')

        try:
            self.client.list('datasets')
        except Exception:
            # Only ordinary errors trigger the fallback; a bare ``except``
            # would also intercept KeyboardInterrupt / SystemExit.
            # Presumably the listing failed because the directory is missing.
            self.client.makedirs('datasets')
Пример #4
0
def main():
    """Copy the first column of an HDFS ``passwd`` file to another HDFS file.

    Reads ``/user/orenault/passwd`` as ``:``-separated data without a header
    row, takes column 0 and writes it (with header, without index, ``:`` as
    separator) to ``/user/orenault/data.avro``, overwriting any existing file.
    """
    hdfs = Config(path=hdfscliconf).get_client()
    with hdfs.read('/user/orenault/passwd') as reader:
        frame = pd.read_csv(reader, sep=':', header=None)
        first_column = frame.iloc[:, 0]
        payload = first_column.to_csv(sep=":", header=True, index=False)
        hdfs.write('/user/orenault/data.avro', payload, overwrite=True)
Пример #5
0
class HDFSLayout(Layout):
    """Layout whose files and JSON config are read through an HDFS client."""

    def __init__(self,
                 path,
                 config=None,
                 dynamic_getters=False,
                 absolute_paths=True,
                 regex_search=False):
        """
        A container for all the files and metadata found at the specified path.
        Args:
            path (str): The root path of the layout.
            config (str): The path to the JSON config file that defines the
            entities and paths for the current layout. A string value is
            treated as an HDFS location and loaded through the HDFS client.
            dynamic_getters (bool): If True, a get_{entity_name}() method will
                be dynamically added to the Layout every time a new Entity is
                created. This is implemented by creating a partial function of
                the get() function that sets the target argument to the
                entity name.
            absolute_paths (bool): If True, grabbit uses absolute file paths
                everywhere (including when returning query results). If False,
                the input path will determine the behavior (i.e., relative if
                a relative path was passed, absolute if an absolute path was
                passed).
            regex_search (bool): Whether to require exact matching (True)
                or regex search (False, default) when comparing the query
                string to each entity in .get() calls. This sets a default for
                the instance, but can be overridden in individual .get()
                requests.
        """
        self._hdfs_client = Config().get_client()

        # abspath() is only applied when no HDFS client is available
        path = abspath(path) if absolute_paths and self._hdfs_client is None \
            else path

        # Preprocess the config file
        if isinstance(config, six.string_types):
            config = self._strip_hdfs_uri(config)
            with self._hdfs_client.read(config) as reader:
                config = json.load(reader)

        super(HDFSLayout, self).__init__(path, config, dynamic_getters,
                                         absolute_paths, regex_search)

    def _strip_hdfs_uri(self, uri):
        """Reduce an ``hdfs://authority/...`` URI to a client-relative path.

        The ``hdfs://`` scheme is removed as an exact prefix, the first
        ``/``-separated component (the authority of such a URI) is dropped,
        and every occurrence of the client's root (without its leading
        character) is removed from what remains.

        ``str.strip('hdfs://')`` must not be used for this: it strips any of
        the characters ``h d f s : /`` from BOTH ends, so a path ending in
        one of them (e.g. ``.../configs``) would silently lose characters.
        """
        scheme = 'hdfs://'
        if uri.startswith(scheme):
            uri = uri[len(scheme):]
        without_authority = '/'.join(uri.split('/')[1:])
        return without_authority.replace(self._hdfs_client.root[1:], '')

    def _get_files(self):
        """Normalise ``self.root`` in place and walk it on HDFS."""
        self.root = self._strip_hdfs_uri(self.root)
        return self._hdfs_client.walk(self.root)

    def _make_file_object(self, root, f):
        """Return a ``File`` for ``root/f`` after opening it once on HDFS."""
        filepath = str(psp.join(root, f))
        # The read stream is opened and closed without being consumed;
        # presumably this only verifies that the file is readable.
        with self._hdfs_client.read(filepath):
            return File(filepath)
	def read(cls, file_path):
		"""Return the lines of the HDFS file ``file_path`` as a list.

		The file is read through the 'dev' client as UTF-8 text split on
		newlines.  On any failure an error message is printed and the
		exception is re-raised.
		"""
		try:
			hdfs = Config().get_client('dev')

			with hdfs.read(file_path, encoding='utf-8', delimiter='\n') as reader:
				# Possibly unnecessary: could the reader itself be returned?
				collected = [line for line in reader]
		except:
			print("ERROR: Could not read from HDFS.")
			raise

		return collected
 def __init__(self):
     """Create the 'dev' HDFS client and initialise the shell state.

     On platforms whose ``sys.platform`` starts with ``'darwin'`` logging is
     configured to write to ``mylog.log`` and a device monitor is started;
     elsewhere ``monitor`` stays ``None``.
     """
     self.client = Config().get_client('dev')

     # Fixed settings
     self.prompt = 'homura_fs $ '
     self.hdfs_xml = '.last_sync.xml'
     self.hdfs_root = '/cs219'

     # Values that start out unset
     self.name = self.local_xml = None
     self.hdfs_loc_xml = self.mount_root = None

     self.meta = HomuraMeta()
     self.monitor = None
     if not sys.platform.startswith('darwin'):
         return

     logging.basicConfig(filename='mylog.log', level=logging.INFO)
     self.monitor = Monitor_Start()
def main():
    """Encrypt-and-hash, or decrypt, selected columns of a CSV stored on HDFS.

    The input file is loaded into a dataframe, the columns listed in the
    parsed options are processed with the RSA key, and the result is written
    to the output file with ``:`` as separator.

    * ``decrypt``: the selected columns are replaced by ``decrypt`` applied
      row-wise.
    * anything else: ``encrypt`` is applied to the selected columns and the
      result is appended as ``<column>_ENC`` columns; the original columns are
      then replaced by ``hash_value`` of their contents.
    """
    opts = parsing_options()
    hdfs = Config().get_client()

    # Load the input file into a dataframe
    with hdfs.read(opts.input) as source:
        df = pd.read_csv(source, sep=opts.delimiter, header=opts.header)

    # The output file is opened before any processing takes place
    with hdfs.write(opts.output, overwrite=opts.overwrite) as sink:
        # Flatten the list of column lists
        positions = list(itertools.chain.from_iterable(opts.column))
        key = get_key(opts.RSAkey, opts.operation)

        # Labels of the columns that need to be hashed / encrypted
        targets = df.iloc[:, positions].columns

        if opts.operation == 'decrypt':
            # ``args`` must be a tuple: (key,) and not (key)
            df[targets] = df[targets].apply(decrypt, args=(key,), axis=1)
        else:
            # Encrypt first, then hash; otherwise the hash would be encrypted
            encrypted = df[targets].apply(encrypt, args=(key,))

            # Suffix the headers so they do not clash when concatenating
            encrypted.columns = [str(label) + '_ENC' for label in targets]
            df = pd.concat([df, encrypted], axis=1)

            # Replace the plain values by their hash
            df[targets] = df[targets].apply(hash_value).values

        df.to_csv(sink, sep=":", header=True, index=False)
Пример #9
0
def _parse_bool(text):
    """Convert a command-line string into a bool (for argparse ``type=``).

    ``type=bool`` cannot be used for this: ``bool('False')`` is ``True``
    because every non-empty string is truthy.
    """
    lowered = str(text).strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean, got %r' % (text,))


def main():
    """Binarize every image under ``folder_path`` ``num`` times with Spark.

    Command-line arguments: ``threshold``, ``folder_path``, ``output_path``,
    ``num`` (2, 4, 6 or 8) and the optional ``-m/--in_memory`` flag
    (default ``True``).

    * in memory: ``binarize`` is chained ``num - 1`` times and the last pass
      is done by ``binarize_and_save`` into ``output_path``.
    * otherwise: every pass is saved to ``output_path + "1"`` and the next
      pass reads its input from there.
    """
    conf = SparkConf().setAppName("binarize nifti")
    sc = SparkContext(conf=conf)
    sc.setLogLevel('ERROR')

    parser = argparse.ArgumentParser(description='Binarize images')
    parser.add_argument('threshold', type=int, help="binarization threshold")
    parser.add_argument('folder_path',
                        type=str,
                        help='folder path containing all of the splits')
    parser.add_argument('output_path', type=str, help='output folder path')
    parser.add_argument('num',
                        type=int,
                        choices=[2, 4, 6, 8],
                        help='number of binarization operations')
    parser.add_argument('-m',
                        '--in_memory',
                        type=_parse_bool,
                        default=True,
                        help='in memory computation')

    args = parser.parse_args()

    nibRDD = sc.binaryFiles(args.folder_path)\
        .map(lambda x: get_data(x))

    client = Config().get_client('dev')

    # ``args.in_memory`` is a real bool now; comparing it with the string
    # 'True' could never succeed.
    if args.in_memory:
        print("Performing in-memory computations")

        # The pass count lives on ``args``; a bare ``num`` is undefined here.
        for _ in range(args.num - 1):
            nibRDD = nibRDD.map(lambda x: binarize(x, args.threshold))
        nibRDD.map(lambda x: binarize_and_save(
            x, args.threshold, args.output_path, client)).collect()

    else:
        print("Writing intermediary results to disk and loading from disk")

        intermediate_path = args.output_path + "1"

        nibRDD.map(lambda x: binarize_and_save(
            x, args.threshold, intermediate_path, client)).collect()

        # Each further pass reads from and writes back to the same folder
        for _ in range(args.num - 1):
            sc.binaryFiles(intermediate_path)\
                .map(lambda x: get_data(x))\
                .map(lambda x: binarize_and_save(
                    x, args.threshold, intermediate_path, client)).collect()
Пример #10
0
 def __init__(self, deviceInfoTableName, kind, dataBaseInfo,
              needFields="*", schema=None):
     """Store the table settings, connect to the database and to HDFS.

     ``connectDB()`` is called after the table-related attributes are set
     and before the HDFS client, ``changtimes`` and ``schema`` are assigned.
     """
     self.dataBaseInfo = dataBaseInfo
     self.prefix, self.kind = deviceInfoTableName, kind
     self.initDir = "/user/ct_fota/YangShuxuanNotDelete"
     # <kind>/<table name>.ini
     self.iniFileName = kind + "/" + deviceInfoTableName + ".ini"
     self.needFields = needFields
     #self.initLog()
     self.connectDB()

     self.clientHDFS = Config().get_client()
     self.changtimes = 0
     self.schema = schema
Пример #11
0
def main():
    """Binarize every image under ``folder_path`` and copy results to HDFS.

    Command-line arguments: ``threshold``, ``folder_path`` and
    ``output_path``.  Each binary file is passed through ``get_data``,
    ``binarize`` and ``copy_to_hdfs``; ``collect()`` triggers the job.
    """
    spark_conf = SparkConf().setAppName("binarize nifti")
    sc = SparkContext(conf=spark_conf)
    sc.setLogLevel('ERROR')

    parser = argparse.ArgumentParser(
        description='Binarize images using FSL installed in a Docker container'
    )
    positional = (
        ('threshold', int, "binarization threshold"),
        ('folder_path', str, 'folder path containing all of the splits'),
        ('output_path', str, 'output folder path'),
    )
    for arg_name, arg_type, arg_help in positional:
        parser.add_argument(arg_name, type=arg_type, help=arg_help)

    args = parser.parse_args()

    print(args.folder_path)
    client = Config().get_client('dev')

    loaded = sc.binaryFiles(args.folder_path).map(lambda img: get_data(img))
    binarized = loaded.map(lambda img: binarize(img, args.threshold))
    copied = binarized.map(
        lambda img: copy_to_hdfs(img, args.output_path, client))
    nibRDD = copied.collect()
Пример #12
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib, urllib.request
from pyquery import PyQuery as pq
from mongoconnect import *
import hashlib

from hdfs import Config
# HDFS client for the 'dev' alias
client = Config().get_client('dev')

KEY_WORD = 'news'
# Bind ``database`` to the module-level object named ``db_<KEY_WORD>``
# (presumably supplied by the star import from ``mongoconnect``).  A plain
# globals() lookup does this without building and exec-ing source text.
database = globals()['db_' + KEY_WORD]


def fetchData(item):
    """Download the article at ``item['href']`` and print its parsed fields.

    The page is decoded as GBK and parsed with PyQuery; headline, category,
    source, timestamp and body text are extracted from ``div#content`` and
    printed.  Nothing is done when the response status is neither 200 nor
    204.

    :param item: mapping with an ``'href'`` key holding the article URL
    """
    request = urllib.request.Request(item['href'])
    # The context manager closes the HTTP response once the body is read
    with urllib.request.urlopen(request, timeout=25) as result:
        # ``code == 200 or 204`` was always true because 204 is truthy
        if result.code not in (200, 204):
            return
        ts = str(result.read(), encoding='gbk')

    d = pq(ts)
    d = d('div#content')
    head = d('div.hd h1').text()
    clas = d('div.a_Info span.a_catlog').text()
    source = d('div.a_Info span.a_source').text()
    time = d('div.a_Info span.a_time').text()
    body = d('div#Cnt-Main-Article-QQ p').text()
    print(time, '  ', clas, '   ', source, '  ', head)
    # MD5 of headline + timestamp; computed but not used in the visible code
    newhashid = hashlib.md5((head + time).encode()).hexdigest()
    print(body)
    #mongo updata class and source,
Пример #13
0
 def __init__(self, profile):
     """Create the HDFS client for the configured alias ``profile``."""
     hdfs_config = Config()
     self.client = hdfs_config.get_client(profile)
Пример #14
0
# In[1]:


import os
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

import hyperloglog
from concurrent.futures import Future

from hdfs import Config
import subprocess

# Use the user's existing HdfsCLI configuration when it yields a client;
# otherwise write a minimal config file into the working directory and load
# the client from that file.
try:
    client = Config().get_client()
except Exception:
    # Only ordinary errors trigger the fallback; a bare ``except`` would
    # also intercept KeyboardInterrupt / SystemExit.
    config_fname = "hdfscli.cfg"
    with open(config_fname, "wt") as f:
        f.write("""
[global]
default.alias = default

[default.alias]
url = http://mipt-master.atp-fivt.org:50070
user = {user}
        """.format(user=os.environ["USER"]))
    client = Config(config_fname).get_client()


# HTTP address of the namenode as reported by the ``hdfs getconf`` tool
nn_address = subprocess.check_output('hdfs getconf -confKey dfs.namenode.http-address', shell=True).strip().decode("utf-8")
Пример #15
0
# Schema of the allele-frequency table: four integer columns, two string
# columns holding the allele frequencies, and a string ID.  ID is the only
# field declared nullable.
schema_Freq_DF = typ.StructType([
    typ.StructField("CHROM", typ.IntegerType(), False),
    typ.StructField("POS", typ.IntegerType(), False),
    typ.StructField("N_ALLELES", typ.IntegerType(), False),
    typ.StructField("N_CHR", typ.IntegerType(), False),
    typ.StructField("ALLELE_FREQ_1", typ.StringType(), False),
    typ.StructField("ALLELE_FREQ_2", typ.StringType(), False),
    typ.StructField("ID", typ.StringType(), True),
])

###############
### Setting up File Paths and Lists
###############

# HDFS client for the 'dev' alias; used below to list the working folders
client = Config().get_client('dev')

# Folders that are listed through the HDFS client (one per dataset)
workingFolder_Indian = "SgIndian_vcf/dataFreeze_Feb2013/SNP/biAllele/"

workingFolder_Malay = "SgMalay_vcf/2012_05/snps/"

workingFolder_Chinese = "1000G_CDX/Phase3/integrated/"

# Collect the entries of the working folder whose names match the expected
# file-name pattern.  re.match anchors only at the start of the name, so
# names with extra trailing text are accepted as well.

freqFiles_Indian = [
    f for f in client.list(workingFolder_Indian)
    if re.match(r'chr\d+_analysis_exome\.frq', f)
]
rsIDFiles_Indian = [
    f for f in client.list(workingFolder_Indian)
Пример #16
0
# encoding: utf-8

"""Sample HdfsCLI script.

This example shows how to write files to HDFS, read them back, and perform a
few other simple filesystem operations.

"""

from hdfs import Config
from json import dump, load


# Get the default alias' client. (See the quickstart section in the
# documentation to learn more about this.)
# Client for the default alias of the HdfsCLI configuration.
client = Config().get_client()

# Some fake data that we are interested in uploading to HDFS: a mapping from
# coefficient name to its float value.
model = {
  '(intercept)': 48.,
  'first_feature': 2.,
  'second_feature': 12.,
}

# First, we delete any existing `models/` folder on HDFS (recursively, so
# its contents are removed as well).
client.delete('models', recursive=True)

# We can now upload the data, first as CSV: one `name,value` line per entry.
# Each `item` is a (key, value) tuple that fills both `%s` placeholders.
with client.write('models/1.csv', encoding='utf-8') as writer:
  for item in model.items():
    writer.write(u'%s,%s\n' % item)
class HomuraFS():
    """Interactive shell that synchronises a mounted device with HDFS.

    Sync state is tracked through the XML documents handled by
    ``HomuraMeta``.  The transfer helpers (``create_file``, ``update_file``,
    ``delete_file``, ``move_file``) take a ``kyuubey`` flag that selects the
    side being changed: ``0`` acts on HDFS (local is the master), ``1`` acts
    on the local device (HDFS is the master).
    """

    def __init__(self):
        """Create the 'dev' HDFS client and initialise the shell state.

        The device monitor is only started (and logging only configured)
        when ``sys.platform`` starts with ``'darwin'``; elsewhere
        ``self.monitor`` stays ``None``.
        """
        self.client = Config().get_client('dev')
        self.prompt = 'homura_fs $ '
        self.name = None
        self.local_xml = None
        self.hdfs_xml = '.last_sync.xml'
        self.hdfs_loc_xml = None
        self.mount_root = None  #os.getcwd() + '/test'
        self.hdfs_root = '/cs219'
        self.meta = HomuraMeta()
        self.monitor = None
        if sys.platform.startswith('darwin'):
            logging.basicConfig(filename='mylog.log', level=logging.INFO)
            self.monitor = Monitor_Start()

    def shell_loop(self):
        """Read commands from stdin until ``quit`` is entered.

        ``sync`` lets the user pick an attached device and synchronises it;
        ``quit`` stops the monitor (if any) and returns.  ``test`` and
        ``download`` are recognised but do nothing, as does any other input.
        """
        while True:
            cmd = raw_input(self.prompt)

            if cmd == 'sync':
                dev = self._choose_device()
                if dev is not None:
                    self._sync_device(dev)
            elif cmd in ('test', 'download'):
                pass
            elif cmd == 'quit':
                if self.monitor:
                    Monitor_Stop(self.monitor)
                return

    def _choose_device(self):
        """List attached devices and return the one the user picks, or None."""
        print("Current devices attached:")

        # Without a monitor (see __init__) there is nothing to enumerate;
        # treat that like an empty device list instead of failing on None.
        devs = [] if self.monitor is None else self.monitor.devs
        if len(devs) == 0:
            print("No device attached")
            return None

        id_mapping = dict()
        for count, dev in enumerate(devs, 1):
            id_mapping[count] = dev
            print("{}) Dname: {}, Hname: {}, Manufacture: {}.\n".format(
                count, dev['Dname'], dev['Hname'], dev['Man']))

        try:
            dev_id = int(raw_input("Which device to sync:\n"))
        except ValueError:
            # Non-numeric input must not take the whole shell down
            print("Invalid device number")
            return None

        # Numbering starts at 1, so 0 and unknown numbers select nothing
        return id_mapping.get(dev_id)

    def _sync_device(self, dev):
        """Point the shell at ``dev``'s mount path and run ``sync_files``."""
        #self.name = dev['UID']
        self.name = ''
        self.mount_root = dev['Path']
        self.local_xml = self.mount_root + '/.last_sync.xml'
        self.hdfs_loc_xml = self.mount_root + '/.cur_hdfs.xml'
        self.meta.myRootpath = self.mount_root

        log('Mount root is ' + self.mount_root)
        log('Device xml file is ' + self.local_xml)
        log('HDFS xml file is ' + self.hdfs_xml)
        log('Copy of HDFS xml stored at ' + self.hdfs_loc_xml)
        log('Syncing files for device ' + dev['Dname'])
        self.sync_files()

    def download_all(self):
        """Download everything under ``hdfs_root`` into the mount root.

        The HDFS root is downloaded into the mount root, its non-hidden
        entries are moved up one level and the downloaded folder is removed.
        Failures are logged and cleaned up on a best-effort basis.  The local
        snapshot XML is rewritten afterwards in either case.
        """
        log('Downloading all files from HDFS to local device')
        try:
            self.create_file(self.mount_root, self.hdfs_root, 1)
            for dir_or_file in os.listdir(self.mount_root + self.hdfs_root):
                if not dir_or_file.startswith('.'):
                    shutil.move(
                        self.mount_root + self.hdfs_root + '/' + dir_or_file,
                        self.mount_root)
            shutil.rmtree(self.mount_root + self.hdfs_root)
        except Exception:
            log('Something went wrog while downloading files')
            try:
                shutil.rmtree(self.mount_root + self.hdfs_root)
            except Exception:
                # Best-effort cleanup: the folder may not exist at all
                pass

        self.meta.path2Xml(self.mount_root)
        self.meta.saveXml(self.local_xml, Xml='temp')

    def upload_all(self):
        """Upload every non-hidden entry of the mount root to ``hdfs_root``.

        A failed upload is logged and the remaining entries are still tried.
        """
        log('Uploading all files from local device to HDFS')

        for dir_or_file in os.listdir(self.mount_root):
            if dir_or_file.startswith('.'):
                continue
            try:
                log('Uploading to ' + self.hdfs_root + '/' + dir_or_file)
                self.client.upload(self.hdfs_root + '/' + dir_or_file,
                                   self.mount_root + '/' + dir_or_file,
                                   n_threads=0)
            except Exception:
                log('Warning: could not upload')

    def load_HDFS_XML(self):
        """Fetch the HDFS snapshot XML, load it and delete the local copy."""
        log("Attempting to fetch HDFS xml")
        self.update_file(self.hdfs_loc_xml, self.hdfs_xml, 1)
        log("Loading HDFS xml")
        self.meta.loadHDFSXml(self.hdfs_loc_xml)
        os.remove(self.hdfs_loc_xml)

    def sync_files(self):
        """Synchronise the selected device and HDFS in both directions.

        Loads the last local snapshot (or an empty one), the HDFS snapshot
        (or an empty one if it cannot be fetched) and the current local
        state, applies the operations reported by
        ``self.meta.getOperations()`` to each side and finally stores the new
        snapshot locally and on HDFS.
        """
        # check if we have an old snapshot xml
        if os.path.isfile(self.local_xml):
            log("Fetching local snapshot xml from " + self.local_xml)
            self.meta.loadSnapshotXml(self.local_xml)
        else:
            # snapshot doesn't exist, so start from an empty one
            log("No local snapshot file was found at " + self.local_xml)
            self.meta.Snapshotdoc = self.meta.emptyXml()

        try:
            # fetch HDFS xml and store locally
            self.load_HDFS_XML()
        except Exception:
            self.meta.HDFSdoc = self.meta.emptyXml()

        self.meta.path2Xml(self.mount_root)
        self.meta.mydoc = self.meta.tempdoc

        # find operations since last sync
        (my_creates, my_deletes, my_modifies, hdfs_creates, hdfs_deletes,
         hdfs_modifies) = self.meta.getOperations()

        root = self.mount_root
        name = self.hdfs_root

        # apply operations on current device
        for path in my_creates:
            if path.endswith('/'):  # path is a folder we want to create
                os.makedirs(root + path)
            else:
                self.create_file(root + path, name + path, 1)
        for path in my_modifies:
            self.update_file(root + path, name + path, 1)
        for path in my_deletes:
            self.delete_file(root + path, 1)

        # apply operations on HDFS
        for path in hdfs_creates:
            if path.endswith('/'):  # path is a folder we want to create
                self.client.makedirs(name + path)
            else:
                self.create_file(root + path, name + path, 0)
        for path in hdfs_modifies:
            self.update_file(root + path, name + path, 0)
        for path in hdfs_deletes:
            self.delete_file(name + path, 0)

        # update last sync for both HDFS and current device
        self.meta.path2Xml(self.mount_root)
        self.meta.saveXml(self.local_xml, Xml='temp')
        self.update_file(self.local_xml, self.hdfs_xml, 0)

    # in this set of functions, when kyuubey = 0, the operation goes
    # from loc to hdfs (i.e. local becomes the "master")
    # when kyuubey = 1, the operation goes from hdfs to loc
    # (i.e. hdfs becomes the "master")
    def create_file(self, loc_path, hdfs_path, kyuubey):
        """Upload (``kyuubey == 0``) or download (``kyuubey == 1``) a path."""
        if kyuubey == 0:
            log('Creating ' + hdfs_path + ' on HDFS')
            self.client.upload(hdfs_path, loc_path, n_threads=0)
        elif kyuubey == 1:
            log('Creating ' + loc_path + ' locally')
            self.client.download(hdfs_path, loc_path, n_threads=0)

    def update_file(self, loc_path, hdfs_path, kyuubey):
        """Overwrite one side's file with the content of the other side.

        ``kyuubey == 0`` rewrites ``hdfs_path`` from ``loc_path``;
        ``kyuubey == 1`` rewrites ``loc_path`` from ``hdfs_path``.
        """
        if kyuubey == 0:  # updating file on HDFS
            log('Updating file ' + hdfs_path + ' on HDFS')
            with open(loc_path) as reader:
                with self.client.write(hdfs_path, overwrite=True) as writer:
                    for line in reader:
                        writer.write(line)
        elif kyuubey == 1:
            log('Updating file ' + loc_path + ' locally')
            # Read from HDFS first: opening the local file for writing
            # truncates it, so a failed read would otherwise leave an empty
            # local file behind.
            with self.client.read(hdfs_path) as reader:
                data = reader.read()
            with open(loc_path, 'w') as writer:
                writer.write(data)

    def delete_file(self, path, kyuubey):
        """Delete ``path`` on HDFS (``kyuubey == 0``) or locally (``1``).

        Directories are removed recursively on both sides.
        """
        if kyuubey == 0:  # delete file on HDFS
            log('Deleting file ' + path + ' from HDFS')
            self.client.delete(path, recursive=True)
        elif kyuubey == 1:  # delete file locally
            log('Deleting file ' + path + ' locally')
            if os.path.isdir(path):
                # os.remove cannot delete directories
                shutil.rmtree(path)
            else:
                os.remove(path)

    def move_file(self, src_path, dst_path, kyuubey):
        """Rename ``src_path`` to ``dst_path`` on HDFS (``0``) or locally."""
        if kyuubey == 0:  # move file on HDFS
            log('Moving file from ' + src_path + ' to ' + dst_path +
                ' on HDFS')
            self.client.rename(src_path, dst_path)
        elif kyuubey == 1:  # move file locally
            os.rename(src_path, dst_path)
            log('Moving file from ' + src_path + ' to ' + dst_path +
                ' locally')

    def __test(self, test_no=1):
        """Reset the mount directory and populate it with a test layout."""
        self.__reset_test()
        if test_no == 1:
            self.__config_basic()
        elif test_no == 2:
            self.__config_outer_empty()

    def __reset_test(self):
        """Delete the mount directory (if present) and recreate it empty."""
        root = self.mount_root
        log('Resetting mount directory')
        if os.path.exists(root):
            shutil.rmtree(root)
        os.makedirs(root)

    def __populate_subdir(self, root):
        """Create ``subdir`` and ``subdir/subsubdir`` with one file each."""
        os.makedirs(root + '/subdir')
        with open(root + '/subdir/test1.txt', 'w') as writer:
            writer.write('a different\ntest1.txt\nfile!\n')
        os.makedirs(root + '/subdir/subsubdir')
        with open(root + '/subdir/subsubdir/test1.txt', 'w') as writer:
            writer.write('yet another different\ntest1.txt\nfile!\n')

    def __config_basic(self):
        """Test layout 1: three top-level files plus the nested folders."""
        root = self.mount_root
        log('Config 1: default')
        with open(root + '/test1.txt', 'w') as writer:
            writer.write('hi\nthere\n!\n')
        with open(root + '/test2.txt', 'w') as writer:
            writer.write('one-liner')
        with open(root + '/test3.txt', 'w') as writer:
            writer.write('')
        self.__populate_subdir(root)

    def __config_outer_empty(self):
        """Test layout 2: only the nested folders, no top-level files."""
        root = self.mount_root
        log('Config 2: outer directory empty')
        self.__populate_subdir(root)
Пример #18
0
class Pickler:
    """Bookkeeping for dataframes, models and graphs stored on HDFS/disk.

    Dataframes and models live on HDFS below ``/user/hadoop/pickles/``;
    graphs are pickled into a local ``pickles/graphs/`` folder next to this
    source tree.  The instance caches which items exist: ``self.df`` holds
    SHA-256 hashes of ``<date>_<source>``, ``self.models`` model names and
    ``self.graphs`` graph dates.
    """

    def __init__(self, sc, spark_session, uri, port):
        """Set up all paths, create the 'dev' HDFS client and fill the caches.

        ``uri`` and ``port`` are concatenated as ``uri + ":" + port`` to form
        the prefix used for Spark reads and writes.
        """
        self.sc = sc
        self.spark_session = spark_session
        self.df = []
        self.models = []
        self.graphs = []
        self.base_path = uri + ":" + port
        self.local_pickle_path = os.path.dirname(
            os.path.realpath(__file__)) + '/../pickles/'

        self.pickle_path = '/user/hadoop/pickles/'
        self.model_path = '/user/hadoop/pickles/models/'

        self.dataset_path = self.pickle_path + "dataset/"
        self.private_release_path = self.dataset_path + "private/"
        self.anon_release_path = self.dataset_path + "github/"
        self.prod_release_path = self.dataset_path + "prod/"

        self.df_path = self.pickle_path + 'df/'
        self.graph_path = self.local_pickle_path + 'graphs/'
        self.labelled_df_path = self.df_path + 'labelled/'
        self.hdfs_client = Config().get_client('dev')

        self.load_df()
        self.load_models()
        self.load_graphs()

    #TODO: Implement generic methods for read dataset / model ONLY
    def read(self):
        """Placeholder; not implemented."""
        pass

    def save(self):
        """Placeholder; not implemented."""
        pass

    def getLabelledFiles(self):
        """Return the HDFS listing of the ``prod`` release folder."""
        return self.hdfs_client.list(self.prod_release_path)

    def readCSVToDF(self, date, folder):
        """Read ``dataset/<folder>/<date>`` as a CSV with a header row."""
        return self.spark_session.read.option(
            "header",
            True).csv(self.base_path + self.dataset_path + folder + "/" + date)

    def getLabelledTelemetry(self):
        """Return the HDFS listing of the ``private`` release folder."""
        return self.hdfs_client.list(self.private_release_path)

    def existsModel(self, name):
        """Return True if ``<name>.model`` is listed in the model folder.

        Always returns a bool (a missing model yields ``False``, not
        ``None``).
        """
        return name + '.model' in self.hdfs_client.list(self.model_path)

    def getModel(self, name):
        """Load and return the ``PipelineModel`` stored as ``<name>.model``."""
        return PipelineModel.load(self.base_path + self.model_path + name +
                                  ".model")

    def isDateLabelled(self, date):
        """Return True if ``<date>.csv`` is listed in the ``prod`` folder."""
        return date + ".csv" in self.hdfs_client.list(self.prod_release_path)

    def load_graphs(self):
        """Cache the date of every ``*.pickle`` file in the graph folder."""
        suffix = ".pickle"
        for entry in os.listdir(self.graph_path):
            if entry.endswith(suffix):
                self.graphs.append(entry[:-len(suffix)])

    def existsGraph(self, date):
        """Return True if a graph for ``date`` is in the cache."""
        return date in self.graphs

    def getGraph(self, date):
        """Unpickle and return the graph for ``date``.

        Returns ``None`` when ``date`` is not in the cache.
        """
        if date not in self.graphs:
            return None
        # NOTE: pickle.load executes code embedded in the file; only load
        # pickles written by a trusted source.
        with open(self.graph_path + date + ".pickle", 'rb') as pickle_file:
            return pickle.load(pickle_file)

    def saveGraph(self, G, date):
        """Pickle ``G`` as the graph for ``date`` unless one is cached.

        Returns ``False`` when the date is already cached and ``None`` after
        a successful write.
        """
        if date in self.graphs:
            return False

        nx.write_gpickle(G, self.graph_path + date + ".pickle")
        self.graphs.append(date)

    def existsDF(self, date, source):
        """Return True if the dataframe ``<date>_<source>`` is cached."""
        #2020.03.01_joy
        return self.getHash(date, source) in self.df

    def load_df(self):
        """Cache the hash of every parquet file in the known df sub-folders.

        The file name without ``.parquet`` is hashed, which matches
        ``getHash`` for files named ``<date>_<source>.parquet``.
        """
        suffix = ".parquet"
        for source in ('joy', 'graph', 'labelled'):
            for entry in self.hdfs_client.list(self.df_path + source):
                if entry.endswith(suffix):
                    stem = entry[:-len(suffix)]
                    self.df.append(sha256(stem.encode('utf-8')).hexdigest())

        # TODO : Load others?

    def saveModel(self, model, name):
        """Save ``model`` on HDFS as ``<name>.model``."""
        model.save(self.base_path + self.model_path + name + ".model")

    def load_models(self):
        """Cache the name of every ``*.model`` entry in the model folder.

        Only the ``.model`` suffix is removed, so names that contain dots
        stay intact and agree with ``existsModel`` / ``getModel``.
        """
        suffix = ".model"
        for entry in self.hdfs_client.list(self.model_path):
            if entry.endswith(suffix):
                self.models.append(entry[:-len(suffix)])

    def saveDFToCSV(self, df, date, folder, coalesced=False):
        """Write ``df`` to ``dataset/<folder>/<date>.csv`` with a header.

        With ``coalesced`` the dataframe is first coalesced to one partition
        and only the CSV is written; otherwise a ``.parquet`` copy is written
        next to the CSV as well.
        """
        target = (self.base_path + self.pickle_path + "dataset/" + folder +
                  '/' + date)

        if coalesced:
            df.coalesce(1).write.csv(target + '.csv', header=True)
        else:
            df.write.csv(target + '.csv', header=True)
            df.write.parquet(target + '.parquet')

    def saveDF(self, df, date, source):
        """Write ``df`` as ``<source>/<date>_<source>.parquet`` unless cached.

        Returns ``False`` when the dataframe is already cached and ``None``
        after a successful write.
        """
        digest = self.getHash(date, source)
        if digest in self.df:
            return False

        df.write.parquet(self.base_path + self.df_path + source + '/' +
                         date + "_" + source + '.parquet')
        self.df.append(digest)

    def getDF(self, date, source):
        """Return the cached dataframe ``<date>_<source>`` or ``False``."""
        if self.getHash(date, source) not in self.df:
            return False

        return self.spark_session.read.parquet(self.base_path +
                                               self.df_path + source + '/' +
                                               date + "_" + source +
                                               '.parquet')

    def getHash(self, date, source):
        """Return the SHA-256 hex digest of ``<date>_<source>``."""
        key = date + "_" + source
        return sha256(key.encode('utf-8')).hexdigest()
HDFS_OUTPUT_DIR = "/OUTPUT/"
HDFS_BASE_URL = "hdfs://bdrenfdludcf01:9000"

if __name__ == "__main__":

    # Folder creation for placing all the spark data
    cmd_a = "mkdir -p " + "/tmp/SPARK_PROCESS/"
    os.system(cmd_a)

    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME).set("spark.local.dir",
                                                "/tmp/SPARK_PROCESS/")

    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    client = Config().get_client('bdrenhdfs')
    files = client.list(HDFS_RAWFILE_DIR)
    totalfilecount = len(files)

    if totalfilecount == 0:
        print("There is no files to be processed, application exiting...")
        sys.exit(0)

    filecount = 0

    for filename in files:
        print(filename)
        if filename.find("Covid_Analysis_DataSet.csv") >= 0:
            filecount = filecount + 1
            df_covid = sqlContext.read.format("csv").option(
                "delimiter",
Пример #20
0
def get_hdfs(alias='lake'):
    """Return the HdfsCLI client configured under ``alias``.

    API reference: https://hdfscli.readthedocs.io/en/latest/api.html
    """
    from hdfs import Config
    return Config().get_client(alias)
Пример #21
0
from hdfs import Config
from sys import argv
from math import ceil

# Usage: <script> <hdfs-path>
# Prints the number of HDFS blocks occupied by the given path.
script, filename = argv

client = Config().get_client()

status = client.status(filename)

# A status with blockSize 0 (WebHDFS reports this for directories) would
# otherwise raise ZeroDivisionError; such a path occupies no blocks.
block_size = status['blockSize']
if block_size:
    print(ceil(status['length'] / block_size))
else:
    print(0)


Пример #22
0
 def __init__(self, debug=False):
     """Build the HDFS client from the '.hdfscli.cfg' file next to this module.

     :param debug: flag stored on the instance as ``self.debug``
     :type debug: bool
     """
     self.debug = debug
     module_dir = os.path.dirname(os.path.abspath(__file__))
     config_file = os.path.join(module_dir, '.hdfscli.cfg')
     self.client = Config(config_file).get_client()
Пример #23
0
class HadoopWebExplorer:
    """Small convenience wrapper around an hdfscli client.

    The client is built from the ``.hdfscli.cfg`` file located next to this
    module. Progress messages go through :meth:`print` and are therefore only
    emitted when ``debug`` is true.

    NOTE(review): the public operations are wrapped in ``catch_hdfs_error``,
    which is defined elsewhere; the "otherwise false" return values described
    below presumably come from that decorator -- confirm against it.
    """

    def __init__(self, debug=False):
        """
        :param debug: enable progress messages
        :type debug: bool
        """
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            '.hdfscli.cfg')
        self.client = Config(path).get_client()
        self.debug = debug

    def print(self, *args):
        """
        Prints the arguments only when debug output is enabled
        """
        if self.debug:
            print(*args)

    def path_exists(self, path):
        """
        Checks whether such path already exists
        :param path: path to check
        :type path: unicode
        :return: boolean flag indicating whether path already exists or not
        :rtype: bool
        """
        # strict=False makes status() return None instead of raising for a missing path
        return self.client.status(path, strict=False) is not None

    @catch_hdfs_error
    def create_folder(self, folder_name):
        """
        Creates folder with the given name if it does not exist
        :param folder_name: the name of the folder we want to add
        :type folder_name: unicode
        :return: returns true if created folder or it already exists, otherwise false
        :rtype: bool
        """
        if self.path_exists(folder_name):
            # routed through self.print so it honours the debug flag like every other message
            self.print(f'Folder already exists: {folder_name}')
            return True

        self.print(f'Folder does not exist: {folder_name}')
        self.client.makedirs(folder_name)
        self.print(f'Folder created: {folder_name}')
        # report success explicitly; previously this path fell through and returned None
        return True

    @catch_hdfs_error
    def write_to_file(self,
                      folder_name,
                      file_name,
                      data,
                      overwrite=False,
                      append=False):
        """
        Writes provided data into file in the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file where data should be written to
        :type file_name: unicode
        :param data: data to be written
        :type data: unicode
        :param overwrite: overwrite any existing file or directory
        :type overwrite: bool
        :param append: append to a file rather than create a new one.
        :type append: bool
        :return: returns true if it successfully wrote the data, otherwise false
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        # Appending is only requested from the client when the file already
        # exists; for a missing file a regular write creates it instead.
        append_to_existing = append and self.path_exists(path)
        self.client.write(path,
                          data,
                          encoding='utf-8',
                          overwrite=overwrite,
                          append=append_to_existing)
        self.print("Written data to HDFS file")
        # report success explicitly; previously this method returned None
        return True

    @catch_hdfs_error
    def read_from_file(self, folder_name, file_name):
        """
        Reads from file in the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file where data should be read from
        :type file_name: unicode
        :return: None when the file does not exist, otherwise the result of
            ``client.read(path)``
        """
        path = os.path.join(folder_name, file_name)
        if not self.path_exists(path):
            self.print(f'File does not exists: {path}')
            return None
        # NOTE(review): in hdfscli, read() is a context manager yielding a
        # reader rather than the file contents -- confirm callers use `with`.
        return self.client.read(path)

    @catch_hdfs_error
    def delete_file(self, folder_name, file_name):
        """
        Deletes file in the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file to be deleted
        :type file_name: unicode
        :return: returns true if it successfully deleted the file, otherwise false
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        return self.client.delete(path)

    @catch_hdfs_error
    def delete_folder(self, folder_name):
        """
        Deletes the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :return: returns true if it successfully deleted the folder, otherwise false
        :rtype: bool
        """
        return self.client.delete(folder_name, recursive=True)

    @catch_hdfs_error
    def explore_folder(self, folder_name):
        """
        Explores the specified folder, printing every file found while walking it
        :param folder_name: name of the folder to be observed
        :type folder_name: unicode
        """
        if not self.path_exists(folder_name):
            self.print(f'Folder does not exists: {folder_name}')
            # nothing to walk; previously execution continued into walk() on the missing path
            return
        self.print(f'Exploring folder: {folder_name}')
        # status=True makes walk() yield (name, status) pairs instead of bare names
        for path, dirs, files in self.client.walk(folder_name, status=True):
            for name, info in files:
                block_size = info['blockSize']
                size = info['length']
                owner = info['owner']
                self.print(
                    f'\tFile: {name}, blockSize: {block_size}, size: {size}, owner: {owner}'
                )
Пример #24
0
from hdfs import Config, InsecureClient
try:
    import cPickle as pickle  # Python 2: C-accelerated pickle
except ImportError:
    import pickle  # Python 3: cPickle no longer exists as a separate module
from tuple import Tuple

# Scratch check: write a small file through the 'dev' alias and show the
# status of its parent directory.
client = Config().get_client('dev')
client.write('a/p', 'aaa', overwrite=True)
# Function-call form prints the same as the Python 2 statement for a single
# argument and also parses on Python 3.
print(client.status('a'))
Пример #25
0
def main():
    """Encrypt or decrypt selected CSV columns while streaming between two HDFS files.

    Options come from ``parsing_options()``. The input file is read in chunks
    with pandas and each processed chunk is appended to the output file; only
    the first chunk is written with a header row.

    * ``encrypt``: a fresh AES key is generated and stored via the Azure key
      vault helpers, a ``<column>_HASH`` column is added for every selected
      column, and the selected columns are encrypted in place.
    * any other operation: the wrapped AES key is taken from the first row of
      the first selected column, unwrapped via the key vault helpers, and the
      selected columns are decrypted in place.
    """
    arg = parsing_options()
    krb_client = Config(path=arg.hdfsConf).get_client()
    az_conf = read_conf(arg.azureConf)
    az_client = az_key_vault_connection(az_conf['azure_client_id'],
                                        az_conf['azure_client_secret'],
                                        az_conf['azure_tenant_id'])
    az_rsa_key = az_get_rsa_key_info(az_client, az_conf['key_vault'],
                                     az_conf['key_name'])
    # arg.column is a list of lists; flatten it into one list of column names
    column = list(itertools.chain.from_iterable(arg.column))

    with krb_client.read(arg.input) as inputFile, \
            krb_client.write(arg.output,
                             overwrite=arg.overwrite) as outputFile:
        if arg.operation == 'encrypt':
            aes_key = generate_aes_key()
            az_conf['uuid'] = str(uuid.uuid4())
            encrypt_and_store_aes_key(az_client, az_conf,
                                      az_rsa_key['version'],
                                      base64.b64encode(aes_key))
            # names of the hash columns added next to the encrypted ones
            hash_columns = [str(name) + '_HASH' for name in column]
            chunks = pd.read_csv(inputFile,
                                 sep=arg.delimiter,
                                 header=arg.header,
                                 dtype=str,
                                 chunksize=10000)
            for position, chunk in enumerate(chunks):
                # hash first, from the still-plain values, then encrypt in place
                chunk[hash_columns] = chunk[column].apply(hash_value)
                chunk[column] = chunk[column].apply(encrypt,
                                                    args=(aes_key,
                                                          az_conf['uuid']))
                chunk.to_csv(outputFile,
                             sep=arg.delimiter,
                             header=(position == 0),
                             index=False)
        else:
            chunks = pd.read_csv(inputFile,
                                 sep=arg.delimiter,
                                 header=arg.header,
                                 dtype=str,
                                 chunksize=1000)
            for position, chunk in enumerate(chunks):
                is_first = position == 0
                if is_first:
                    # Split the first selected column on '-' (at most 3 splits)
                    # and take piece 3 of the row labelled 0: the base64-encoded
                    # key used to recover the AES key.
                    pieces = chunk[column[0]].str.split(pat='-',
                                                        n=3,
                                                        expand=True)
                    key = base64.b64decode(pieces[3][0])
                    aes_key = retrieve_and_decrypt_aes_key(
                        az_client, az_conf, az_rsa_key['version'], key)
                chunk[column] = chunk[column].apply(decrypt,
                                                    args=(aes_key, ))
                chunk.to_csv(outputFile,
                             sep=arg.delimiter,
                             header=is_first,
                             index=False)
Пример #26
0
#!/usr/bin/env python3

from hdfs import Config
import sys

# Usage: <script> <hdfs-path>
# Prints the first 10 bytes of the given HDFS file as text.
client = Config().get_client()
filename = sys.argv[1]
with client.read(filename) as reader:
    ans = reader.read(10)
    # A fixed 10-byte prefix can end in the middle of a multi-byte UTF-8
    # character; replace undecodable bytes instead of raising.
    print(ans.decode(errors='replace'))
Пример #27
0
def main(argv):
    """Classify a batch of images with TensorFlow on Spark and save the result as JSON.

    Command-line options (all optional) override attributes of the constants
    module ``C``:

    * ``-h``      print usage and exit
    * ``-s url``  image index URL (``C.images_index_url``)
    * ``-d dir``  output directory (``C.dir_classification``)
    * ``-n num``  number of images to process (``C.numero_imagenes_proceso``)
    * ``-l num``  images per batch (``C.lote_size``)
    * ``-m num``  maximum labels to keep (``C.max_etiquetas``)

    :param argv: command-line arguments without the program name
    """

    def _print_usage():
        # Shared by the -h option and the invalid-option path.
        print(
            'usage: spark-submit \\ \n --master <master> \\ \n <path>/ClasificacionImagenes.py \\'
        )
        print(' [-d <directorio_salida] [-n <numero_imagenes>] \\ \n ')
        print(' [-l <tamaño_lote] [-m max_etiquetas] [-s url_images]')

    # Validate the input. "s:" is part of the spec so that the documented -s
    # option is accepted instead of being rejected as unknown.
    try:
        opts, args = getopt.getopt(argv, "hd:n:l:m:s:")
    except getopt.GetoptError:
        _print_usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            _print_usage()
            sys.exit()
        elif opt == "-s":
            C.images_index_url = arg
        elif opt == "-d":
            C.dir_classification = arg
        elif opt == "-n":
            C.numero_imagenes_proceso = int(arg)
        elif opt == "-l":
            C.lote_size = int(arg)
        elif opt == "-m":
            C.max_etiquetas = int(arg)

    print("Directorio :             ", C.dir_classification)
    print("Num Imagenes a procesar: ", C.numero_imagenes_proceso)
    print("Imagenes por lote:       ", C.lote_size)
    print("Max etiquetas a guardar: ", C.max_etiquetas)

    # ***************************************************************************
    # Start of the process
    # ***************************************************************************
    global node_lookup_bc
    global model_data_bc

    # Start the SparkContext
    print("Inicio: ",
          datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))
    sc = SparkContext(
        appName='Clasificacion MirFlickr con TensorFlow',
        pyFiles=[
            '/home/utad/TFM/Fuentes/TensorFlowMirFlickr/Constantes.py',
            '/home/utad/TFM/Fuentes/TensorFlowMirFlickr/NodeLookup.py'
        ])
    get_tensorflow_model()

    # Load the model and broadcast it
    model_path = os.path.join(C.model_dir, 'classify_image_graph_def.pb')
    with tf.gfile.FastGFile(model_path, 'rb') as f:
        model_data = f.read()
    model_data_bc = sc.broadcast(model_data)

    # Broadcast the node lookup so the workers can use it
    node_lookup = NodeLookup().node_lookup
    node_lookup_bc = sc.broadcast(node_lookup)

    # Fetch the list of images to process and group them into batches
    servicio_imagenes = None
    try:
        # NOTE(review): urllib.urlopen is the Python 2 API -- confirm the target interpreter
        servicio_imagenes = urllib.urlopen(C.images_index_url)
    except Exception as e:
        print(e)
        print("Servidor de imágenes no disponible")
        # sys.exit instead of the interactive-only `exit` helper; same exit code
        sys.exit(404)

    # The index is split on '<li>'; the first two pieces are skipped
    imagenes = servicio_imagenes.read().split(
        '<li>')[2:C.numero_imagenes_proceso + 2]
    lote_imagenes = [
        imagenes[i:i + C.lote_size]
        for i in range(0, len(imagenes), C.lote_size)
    ]

    # Parallelize the image batches and process them
    rdd_imagenes = sc.parallelize(lote_imagenes).map(
        lambda x: map(obtener_nombre_imagen, x))
    inception_rdd = rdd_imagenes.flatMap(procesar_lote_imagenes)

    # Delete the 'inception' directory from HDFS in case it already exists
    client = Config().get_client()
    client.delete('inception', recursive=True)

    # Save the results as JSON; a DataFrame is used for that
    print("Procesamos:",
          datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))
    # Kept although unused: the session is created before toDF() is called on the RDD
    spark = SparkSession(sc)
    inception_df = inception_rdd.toDF()
    inception_df.write.json(C.dir_classification)
    print("Fin:", datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S'))
Пример #28
0
#!/usr/bin/env python
# encoding: utf-8
"""Avro extension example."""

from hdfs import Config
from hdfs.ext.avro import AvroReader, AvroWriter

# Client for the default alias.
client = Config().get_client()

# Two sample records sharing the same simple shape.
records = [
    {'name': 'Ann', 'age': 23},
    {'name': 'Bob', 'age': 22},
]

# Write the records to an Avro file on HDFS. No schema is passed: with records
# this simple the writer is left to infer it.
with AvroWriter(client, 'names.avro', overwrite=True) as writer:
    for record in records:
        writer.write(record)

# Open the file again and pick up the schema the writer inferred.
with AvroReader(client, 'names.avro') as reader:
    schema = reader.schema
Пример #29
0
class SparkHDFSClient(object):
    """Browse the files of an HDFS-backed datasource and preview their content.

    ``datasource`` is expected to expose ``url`` (HDFS path) and ``dstype``
    (one of the ``DataSourceType.SPARK_*`` values); the HDFS client is taken
    from the "dev" alias of the hdfscli configuration.
    """

    def __init__(self, datasource):
        self.datasource = datasource
        self.client = Config().get_client("dev")

    def get_file_list(self, folder):
        """Return the entries of ``folder``, each prefixed with ``folder + '/'``."""
        files = self.client.list(folder.strip())
        files = [folder + '/' + file for file in files]
        return files

    def list_collections(self):
        """List the documents (files) available under the datasource URL.

        Returns a list of dicts with keys ``db`` (parent path), ``document``
        (file name) and ``count`` (always -1). For a directory the tree is
        walked and only files whose suffix matches the datasource type are
        kept; for a single file one entry is returned; for a missing path the
        list is empty.
        """
        results = []
        status = self.client.status(self.datasource.url, strict=False)
        print(status, self.datasource.url)
        if status is None:
            return results

        if status['type'] != "DIRECTORY":
            return [{
                "db": self.datasource.url,
                "document": self.datasource.url,
                "count": -1
            }]

        # Depth-first walk using the list as a stack of pending paths.
        pending = self.get_file_list(self.datasource.url)
        while len(pending) > 0:
            path = pending.pop()
            status = self.client.status(os.path.join(self.datasource.url,
                                                     path),
                                        strict=False)
            if status is None:
                continue
            if status['type'] == "DIRECTORY":
                pending.extend(
                    self.get_file_list(
                        os.path.join(self.datasource.url, path)))
                continue

            # Skip files whose suffix does not match the datasource type
            # ('sv' covers both .csv and .tsv).
            if self.datasource.dstype == DataSourceType.SPARK_CSV and path[-2:] != 'sv' \
                    or self.datasource.dstype == DataSourceType.SPARK_TSV and path[-2:] != 'sv'\
                    or self.datasource.dstype == DataSourceType.SPARK_XML and path[-3:] != 'xml'\
                    or self.datasource.dstype == DataSourceType.SPARK_JSON and path[-4:] != 'json':
                continue

            has_parent = '/' in path
            results.append({
                "db":
                path[:path.rfind('/')] if has_parent else self.datasource.url,
                "document":
                path[path.rfind('/') + 1:] if has_parent else path,
                "count":
                -1
            })

        return results

    @staticmethod
    def _summarise_json_value(value, stringify_lists):
        """Reduce one JSON field value to a compact preview.

        * list whose first item is a dict -> string of that dict's keys
        * any other list (including an empty one) -> ``str(value)`` when
          ``stringify_lists`` is true, otherwise the list itself
        * dict -> string of its keys
        * anything else -> returned unchanged
        """
        if isinstance(value, list):
            # `value and ...` guards the [0] access for empty lists
            if value and isinstance(value[0], dict):
                return str(list(value[0].keys()))
            return str(value) if stringify_lists else value
        if isinstance(value, dict):
            return str(list(value.keys()))
        return value

    def get_documents(self, filename, limit=10):
        """Return a preview of ``filename`` as ``(rows, limit)``.

        * CSV/TSV: the first non-empty, non-``#`` line is the header; up to
          ``limit`` following lines become dicts keyed by header name. Lines
          are split on ',' when the file name ends in "csv", otherwise on tab.
        * XML: a single row ``{'content': <first 2048-byte chunk>}``.
        * JSON: the document is loaded whole; each record's fields are
          summarised with :meth:`_summarise_json_value`.

        ``rows`` is truncated to ``limit`` entries.
        """
        results = []
        if self.datasource.dstype == DataSourceType.SPARK_CSV or \
                self.datasource.dstype == DataSourceType.SPARK_TSV:
            header = None
            rows = 0
            separator = ',' if filename[-3:] == "csv" else '\t'
            with self.client.read(filename,
                                  encoding='utf-8',
                                  delimiter="\n") as reader:
                for line in reader:
                    if len(line.strip()) == 0 or line[0] == '#':
                        continue
                    fields = line.split(separator)

                    if header is None:
                        header = fields
                        continue
                    # zip() pairs names with values and drops whichever side is longer
                    results.append(dict(zip(header, fields)))
                    rows += 1
                    # stop as soon as enough rows were collected
                    if rows >= limit:
                        break
        elif self.datasource.dstype == DataSourceType.SPARK_XML:
            with self.client.read(filename, encoding='utf-8',
                                  chunk_size=2048) as reader:
                # only the first chunk is used as the preview
                for chunk in reader:
                    res = {'content': str(chunk)}
                    results.append(res)
                    print(results)
                    break
        elif self.datasource.dstype == DataSourceType.SPARK_JSON:
            with self.client.read(filename, encoding='utf-8') as reader:
                model = load(reader)
                if isinstance(model, list):
                    # Each record is summarised from its own values; plain
                    # lists are stringified in this branch.
                    results.extend([{
                        p: self._summarise_json_value(md[p], True)
                        for p in md
                    } for md in model])
                else:
                    results.append({
                        p: self._summarise_json_value(model[p], False)
                        for p in model
                    })

        return results[:limit], limit
Пример #30
0
class PendingWindow(object):
    """HDFS-backed backup of a node's output tuples, grouped into versions.

    Tuples are pickled and appended to the file ``<backup_dir>/current``. When
    a ``BarrierTuple`` is appended, the current file is renamed to the
    barrier's version number and a new empty ``current`` file is started.
    Two small files track progress: ``latest_version`` (last completed
    version) and ``safe_version`` (last version truncation was run against).
    """
    def __init__(self, backup_dir, node):
        # TODO: not cut
        # each pending window (or node) only has a single downstream cut,
        # otherwise inconsistency occurs during truncating
        self.backup_dir = backup_dir
        self.node = node

        self.hdfs_client = Config().get_client('dev')

        self.hdfs_client.makedirs(self.backup_dir)

        # each backup file is named by the ending version, so the current writing one is named temporarily
        self.current_backup_path = os.path.join(self.backup_dir, 'current')
        # touch the file for later appending
        self.hdfs_client.write(self.current_backup_path, data='')

        # the version that last truncation conducted against
        self.safe_version_path = os.path.join(self.backup_dir, 'safe_version')
        # special case for initial version
        self.hdfs_client.write(self.safe_version_path, data=str(0))

        # the latest integral version
        self.latest_version_path = os.path.join(self.backup_dir,
                                                'latest_version')
        # special case for initial version
        self.hdfs_client.write(self.latest_version_path, data=str(0))

        # acknowledged version per downstream connector (not tracked for sinks)
        if self.node.type != 'sink':
            self.version_acks = dict()
            for n in self.node.downstream_connectors:
                self.version_acks[n] = 0

    def _seal_version(self, version):
        """Close the current backup file as ``version`` and start a new one."""
        self.hdfs_client.rename(
            self.current_backup_path,
            os.path.join(self.backup_dir, str(version)))
        self.hdfs_client.write(self.latest_version_path,
                               data=str(version),
                               overwrite=True)
        # touch a fresh 'current' file for later appending
        self.hdfs_client.write(self.current_backup_path, data='')

    def append(self, tuple_):
        """Make an output tuple persistent, and complete a version if necessary
        """

        self.hdfs_client.write(self.current_backup_path,
                               data=pickle.dumps(tuple_),
                               append=True)

        if isinstance(tuple_, BarrierTuple):
            self._seal_version(tuple_.version)

    def extend(self, tuples):
        """Make a sequence of output tuples persistent in one write.

        The version is completed when the last tuple is a ``BarrierTuple``.
        An empty sequence is a no-op.
        """
        # nothing to persist, and tuples[-1] below would fail on an empty sequence
        if not tuples:
            return

        # TODO: can be improved
        with self.hdfs_client.write(self.current_backup_path,
                                    append=True) as f:
            for t in tuples:
                pickle.dump(t, f)

        last = tuples[-1]
        if isinstance(last, BarrierTuple):
            self._seal_version(last.version)

    def truncate(self, version):
        """Delete files with filename <= version
        """
        # only completed versions have all-digit names; 'current' and the
        # bookkeeping files are left alone
        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) <= version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

    def handle_version_ack(self, version_ack):
        """Record a downstream acknowledgement and truncate when the safe version advances.

        The safe version is the minimum acknowledged version over all
        downstream connectors.
        """
        old_safe_version = min(self.version_acks.values())
        self.version_acks[version_ack.sent_from] = version_ack.version
        new_safe_version = min(self.version_acks.values())

        if new_safe_version > old_safe_version:
            self.hdfs_client.write(self.safe_version_path,
                                   data=str(new_safe_version),
                                   overwrite=True)
            self.truncate(new_safe_version)

    def get_latest_version(self):
        """Return the last completed version as stored in the latest_version file."""
        with self.hdfs_client.read(self.latest_version_path) as f:
            latest_version = int(f.read())
        return latest_version

    def rewind(self, version=None):
        """Delete files with filename > version (including current file)

        With ``version=None`` only the current file is emptied.
        """

        if version is None:
            self.hdfs_client.write(self.current_backup_path,
                                   data='',
                                   overwrite=True)
            return

        # TODO: underflow
        # assert version == 0 or
        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) > version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        self.hdfs_client.write(self.current_backup_path,
                               data='',
                               overwrite=True)

        self.hdfs_client.write(self.latest_version_path,
                               data=str(version),
                               overwrite=True)

    def replay(self):
        """When both the node and pending window state are ready, replay the pending window before resuming
        """

        names = self.hdfs_client.list(self.backup_dir)
        # name.isdigit() is used as in truncate/rewind and works for both
        # str and unicode names
        for v in sorted(int(name) for name in names if name.isdigit()):
            # filter out the faster nodes
            tuples = []
            with self.hdfs_client.read(os.path.join(self.backup_dir,
                                                    str(v))) as f:
                # NOTE(review): pickle.load trusts the file contents; the
                # backup directory must not be writable by untrusted parties.
                while True:
                    try:
                        t = pickle.load(f)
                        tuples.append(t)
                    except EOFError:
                        self.node.LOGGER.debug(
                            'reached EOF, send this version')
                        break
                    # Spout needs version too, so that data source can resend from a version
                self.node.multicast(self.node.downstream_nodes, tuples)
Пример #31
0
 def __init__(self, datasource):
     """Keep the datasource and obtain the HDFS client of the "dev" alias."""
     hdfs_client = Config().get_client("dev")
     self.datasource = datasource
     self.client = hdfs_client