示例#1
0
文件: hdfs.py 项目: qmac/grabbit
class HDFSLayout(Layout):
    """Layout variant whose config and files are read through an HDFS client.

    The client is obtained from ``Config().get_client()``; every path handed
    to it is first converted from an ``hdfs://host/...`` location into a path
    relative to the client's root.
    """

    def __init__(self,
                 path,
                 config=None,
                 dynamic_getters=False,
                 absolute_paths=True,
                 regex_search=False):
        """
        A container for all the files and metadata found at the specified path.
        Args:
            path (str): The root path of the layout.
            config (str): The path to the JSON config file that defines the
                entities and paths for the current layout. When a string is
                given it is read through the HDFS client and parsed as JSON;
                any other value is passed to the base class unchanged.
            dynamic_getters (bool): If True, a get_{entity_name}() method will
                be dynamically added to the Layout every time a new Entity is
                created. This is implemented by creating a partial function of
                the get() function that sets the target argument to the
                entity name.
            absolute_paths (bool): If True, grabbit uses absolute file paths
                everywhere (including when returning query results). If False,
                the input path will determine the behavior (i.e., relative if
                a relative path was passed, absolute if an absolute path was
                passed). The path is only made absolute locally when no HDFS
                client is available.
            regex_search (bool): Instance-wide default for how the query
                string is compared to each entity in .get() calls; it can be
                overridden in individual .get() requests.
                NOTE(review): the previous wording said True means exact
                matching and False regex search, which reads inverted for a
                flag with this name -- confirm against Layout.
        """
        self._hdfs_client = Config().get_client()

        path = abspath(path) if absolute_paths and self._hdfs_client is None \
            else path

        # Preprocess the config file
        if isinstance(config, six.string_types):
            config = self._to_client_path(config)
            with self._hdfs_client.read(config) as reader:
                config = json.load(reader)

        super(HDFSLayout, self).__init__(path, config, dynamic_getters,
                                         absolute_paths, regex_search)

    def _to_client_path(self, location):
        """Convert an ``hdfs://host/...`` location into a client-relative path.

        The scheme is removed as a *prefix* (``str.strip('hdfs://')`` would
        strip any of the characters ``h d f s : /`` from both ends and could
        eat the end of a name such as ``.../configs``). The first
        ``/``-separated component (the host part of a URL) is then dropped and
        every occurrence of the client root, without its leading character,
        is removed.
        """
        scheme = 'hdfs://'
        if location.startswith(scheme):
            location = location[len(scheme):]
        without_host = '/'.join(location.split('/')[1:])
        return without_host.replace(self._hdfs_client.root[1:], '')

    def _get_files(self):
        """Rewrite ``self.root`` as a client-relative path and walk it.

        Returns whatever the client's ``walk`` yields for that root.
        NOTE(review): ``self.root`` is overwritten on every call, so calling
        this twice converts an already-converted path again.
        """
        self.root = self._to_client_path(self.root)
        return self._hdfs_client.walk(self.root)

    def _make_file_object(self, root, f):
        """Build a ``File`` for ``root``/``f`` after opening it on HDFS.

        The read context is entered and left without consuming any data;
        presumably this is an existence/readability check -- TODO confirm.
        """
        filepath = str(psp.join(root, f))
        with self._hdfs_client.read(filepath):
            return File(filepath)
示例#2
0
def main():
    """Copy the first column of an HDFS passwd file to a new HDFS file.

    Reads ``/user/orenault/passwd`` as a headerless, colon-separated table,
    takes its first column and writes that column's CSV text to
    ``/user/orenault/data.avro``, replacing any existing file.
    """
    hdfs = Config(path=hdfscliconf).get_client()
    with hdfs.read('/user/orenault/passwd') as source:
        frame = pd.read_csv(source, sep=':', header=None)
        first_column = frame.iloc[:, 0]
        # With no target given, to_csv returns the CSV text instead of
        # writing it anywhere.
        payload = first_column.to_csv(sep=":", header=True, index=False)
        hdfs.write('/user/orenault/data.avro', payload, overwrite=True)
	def read(cls, file_path):
		"""Return the lines of the HDFS file at *file_path* as a list.

		The file is read through the 'dev' client as UTF-8 text split on
		newlines. Any failure is reported on stdout and then re-raised.
		"""
		try:
			hdfs = Config().get_client('dev')

			with hdfs.read(file_path, encoding='utf-8', delimiter='\n') as reader:
				# Possibly unnecessary: could the reader itself be returned?
				lines = list(reader)
		except:
			print("ERROR: Could not read from HDFS.")
			raise

		return lines
def main():
  """Encrypt-and-hash or decrypt selected columns of a delimited HDFS file.

  The input is loaded into a DataFrame, the columns named on the command line
  are transformed, and the result is written to the output path as
  colon-separated CSV with a header row.

  All fallible work (key loading, transformation) happens before the output
  stream is opened, so a failure does not leave a truncated output file.
  """
  arg = parsing_options()
  client = Config().get_client()
  with client.read(arg.input) as inputFile:
    # Load file in dataframe
    df = pd.read_csv(inputFile, sep=arg.delimiter, header=arg.header)

  # Flatten the list of columns
  column = list(itertools.chain.from_iterable(arg.column))
  # open RSA key
  key = get_key(arg.RSAkey, arg.operation)

  # Labels of the columns which need to be hashed / encrypted
  colName = df.iloc[:, column].columns

  if arg.operation == 'decrypt':
    # args must be a tuple: (key,) is a one-element tuple, (key) is just key.
    # axis=1 hands each row to decrypt.
    df[colName] = df[colName].apply(decrypt, args=(key,), axis=1)
  else:
    # Encrypt then hash - as otherwise we encrypt the hash value.
    # No axis is given here, so pandas' default (axis=0) applies and encrypt
    # receives one column at a time.
    encrypted = df[colName].apply(encrypt, args=(key,))

    # Rename header to not clash when merging df + encrypted data frame
    encrypted.columns = [str(name) + '_ENC' for name in colName]

    # Concatenate both dataframe
    df = pd.concat([df, encrypted], axis=1)

    # Replace the original columns by their hash
    df[colName] = df[colName].apply(hash_value).values

  # Open output file and write the result
  with client.write(arg.output, overwrite=arg.overwrite) as outputFile:
    df.to_csv(outputFile, sep=":", header=True, index=False)
示例#5
0
class PendingWindow(object):
    """HDFS-backed log of a node's output tuples, grouped into versions.

    Tuples are pickled into a ``current`` file inside ``backup_dir``. When a
    ``BarrierTuple`` is written, ``current`` is renamed to that barrier's
    version number, ``latest_version`` is updated and a fresh empty
    ``current`` file is started. ``safe_version`` records the version the
    last truncation was conducted against.
    """

    def __init__(self, backup_dir, node):
        """Create the backup directory and its bookkeeping files.

        Args:
            backup_dir: HDFS directory that holds this window's files.
            node: owning node; its ``type`` and ``downstream_connectors``
                are read here, ``LOGGER``, ``multicast`` and
                ``downstream_nodes`` are used by ``replay``.

        NOTE(review): the initial writes do not pass ``overwrite=True``;
        confirm the intended behaviour when ``backup_dir`` already contains
        these files.
        """
        # TODO: not cut
        # each pending window (or node) only has a single downstream cut,
        # otherwise inconsistency occurs during truncating
        self.backup_dir = backup_dir
        self.node = node

        self.hdfs_client = Config().get_client('dev')

        self.hdfs_client.makedirs(self.backup_dir)

        # each backup file is named by the ending version, so the current writing one is named temporarily
        self.current_backup_path = os.path.join(self.backup_dir, 'current')
        # touch the file for later appending
        self.hdfs_client.write(self.current_backup_path, data='')

        # the version that last truncation conducted against
        self.safe_version_path = os.path.join(self.backup_dir, 'safe_version')
        # special case for initial version
        self.hdfs_client.write(self.safe_version_path, data=str(0))

        # the latest integral version
        self.latest_version_path = os.path.join(self.backup_dir,
                                                'latest_version')
        # special case for initial version
        self.hdfs_client.write(self.latest_version_path, data=str(0))

        # version_acks only exists for non-sink nodes: one entry per
        # downstream connector, starting at version 0
        if self.node.type != 'sink':
            self.version_acks = dict()
            for n in self.node.downstream_connectors:
                self.version_acks[n] = 0

    def _complete_version(self, version):
        """Seal the current file as *version* and start a new empty one."""
        self.hdfs_client.rename(
            self.current_backup_path,
            os.path.join(self.backup_dir, str(version)))
        self.hdfs_client.write(self.latest_version_path,
                               data=str(version),
                               overwrite=True)
        self.hdfs_client.write(self.current_backup_path, data='')

    def append(self, tuple_):
        """Make an output tuple persistent, and complete a version if necessary
        """

        self.hdfs_client.write(self.current_backup_path,
                               data=pickle.dumps(tuple_),
                               append=True)

        if isinstance(tuple_, BarrierTuple):
            self._complete_version(tuple_.version)

    def extend(self, tuples):
        """Persist a sequence of tuples; complete a version if the last one
        is a BarrierTuple. An empty sequence is a no-op.
        """
        if not tuples:
            return

        # TODO: can be improved
        with self.hdfs_client.write(self.current_backup_path,
                                    append=True) as f:
            for t in tuples:
                pickle.dump(t, f)

        if isinstance(tuples[-1], BarrierTuple):
            self._complete_version(tuples[-1].version)

    def truncate(self, version):
        """Delete files with filename <= version
        """
        for f in self.hdfs_client.list(self.backup_dir):
            # only the numerically named files are sealed versions
            if f.isdigit() and int(f) <= version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

    def handle_version_ack(self, version_ack):
        """Record a downstream acknowledgement and truncate if the minimum
        acknowledged version advanced.

        Reads ``version_ack.sent_from`` and ``version_ack.version``. Relies
        on ``self.version_acks``, which ``__init__`` only creates for
        non-sink nodes.
        """
        old_safe_version = min(self.version_acks.values())
        self.version_acks[version_ack.sent_from] = version_ack.version
        new_safe_version = min(self.version_acks.values())

        if new_safe_version > old_safe_version:
            self.hdfs_client.write(self.safe_version_path,
                                   data=str(new_safe_version),
                                   overwrite=True)
            self.truncate(new_safe_version)

    def get_latest_version(self):
        """Return the integer stored in the ``latest_version`` file."""
        with self.hdfs_client.read(self.latest_version_path) as f:
            latest_version = int(f.read())
        return latest_version

    def rewind(self, version=None):
        """Delete files with filename > version (including current file)

        With ``version=None`` only the current file is reset to empty.
        """

        if version is None:
            self.hdfs_client.write(self.current_backup_path,
                                   data='',
                                   overwrite=True)
            return

        # TODO: underflow
        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) > version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        self.hdfs_client.write(self.current_backup_path,
                               data='',
                               overwrite=True)

        self.hdfs_client.write(self.latest_version_path,
                               data=str(version),
                               overwrite=True)

    def replay(self):
        """When both the node and pending window state are ready, replay the pending window before resuming
        """
        # Same name test as truncate()/rewind(); works for both byte and
        # text file names.
        versions = sorted(
            int(name) for name in self.hdfs_client.list(self.backup_dir)
            if name.isdigit())

        for v in versions:
            # filter out the faster nodes
            tuples = []
            with self.hdfs_client.read(os.path.join(self.backup_dir,
                                                    str(v))) as f:
                # NOTE: pickle.load executes whatever the file encodes; this
                # is only safe while the backup directory is written solely
                # by this class.
                while True:
                    try:
                        tuples.append(pickle.load(f))
                    except EOFError:
                        self.node.LOGGER.debug(
                            'reached EOF, send this version')
                        break
                    # Spout needs version too, so that data source can resend from a version
                self.node.multicast(self.node.downstream_nodes, tuples)
示例#6
0
#! /usr/bin/env python3

from hdfs import Config
import sys

client = Config().get_client()
filename = sys.argv[1]
# Print the first 10 bytes of the HDFS file named on the command line,
# decoded with the default codec.
with client.read(filename) as stream:
    print(stream.read(10).decode())
示例#7
0
  'first_feature': 2.,
  'second_feature': 12.,
}

# Start from a clean slate: remove any `models/` folder already on HDFS.
client.delete('models', recursive=True)

# Upload the model as CSV, one `name,value` row per entry.
with client.write('models/1.csv', encoding='utf-8') as writer:
  for name, value in model.items():
    writer.write(u'%s,%s\n' % (name, value))

# Upload it a second time, serialized as JSON straight into the writer.
with client.write('models/1.json', encoding='utf-8') as writer:
  dump(model, writer)

# Both files should now be listed; their properties can be fetched too.
assert client.list('models') == ['1.csv', '1.json']
status = client.status('models/1.csv')
content = client.content('models/1.json')

# Download the CSV again. The `delimiter` option makes it convenient to read
# CSV files; empty pieces are skipped.
with client.read('models/1.csv', delimiter='\n', encoding='utf-8') as reader:
  items = (line.split(',') for line in reader if line)
  assert {name: float(value) for name, value in items} == model

# The JSON copy can be parsed directly from the HDFS reader.
with client.read('models/1.json', encoding='utf-8') as reader:
  assert load(reader) == model
示例#8
0
class SparkHDFSClient(object):
    """Browse the files of a Spark data source stored on HDFS.

    ``datasource`` must expose ``url`` (the HDFS location) and ``dstype``
    (compared against the ``DataSourceType.SPARK_*`` members).
    """

    def __init__(self, datasource):
        self.datasource = datasource
        self.client = Config().get_client("dev")

    def get_file_list(self, folder):
        """Return the entries of *folder*, each prefixed with ``folder + '/'``.

        The folder name is stripped of surrounding whitespace for the HDFS
        listing only; the returned paths use *folder* as given.
        """
        names = self.client.list(folder.strip())
        return [folder + '/' + name for name in names]

    def list_collections(self):
        """List the files under the data source URL that match its type.

        Returns:
            list of dicts with keys ``db`` (containing folder), ``document``
            (file name) and ``count`` (always -1). Empty when the URL does
            not exist; a single entry describing the URL itself when it is
            not a directory.
        """
        url = self.datasource.url
        status = self.client.status(url, strict=False)
        print(status, url)
        if status is None:
            return []

        if status['type'] != "DIRECTORY":
            return [{"db": url, "document": url, "count": -1}]

        dstype = self.datasource.dstype
        results = []
        pending = self.get_file_list(url)
        while pending:
            entry = pending.pop()
            # NOTE(review): entries already carry the ``url + '/'`` prefix
            # from get_file_list; joining again only collapses to the entry
            # itself when the URL is absolute -- confirm for relative URLs.
            entry_path = os.path.join(url, entry)
            status = self.client.status(entry_path, strict=False)
            if status is None:
                continue
            if status['type'] == "DIRECTORY":
                # descend: queue the directory's own entries
                pending.extend(self.get_file_list(entry_path))
                continue

            # skip files whose name does not end like the source type
            if dstype == DataSourceType.SPARK_CSV and not entry.endswith('sv') \
                    or dstype == DataSourceType.SPARK_TSV and not entry.endswith('sv') \
                    or dstype == DataSourceType.SPARK_XML and not entry.endswith('xml') \
                    or dstype == DataSourceType.SPARK_JSON and not entry.endswith('json'):
                continue

            if '/' in entry:
                cut = entry.rfind('/')
                folder, document = entry[:cut], entry[cut + 1:]
            else:
                folder, document = url, entry
            results.append({"db": folder, "document": document, "count": -1})

        return results

    @staticmethod
    def _summarize_json_value(value, stringify_plain_lists):
        """Reduce one JSON value to a flat, displayable form.

        A non-empty list whose first element is a dict becomes the string of
        that dict's keys; a dict becomes the string of its keys; any other
        list is returned as ``str(value)`` when *stringify_plain_lists* is
        true and unchanged otherwise; scalars are returned unchanged.
        """
        if isinstance(value, list):
            # ``value and ...`` keeps an empty list from being indexed
            if value and isinstance(value[0], dict):
                return str(list(value[0].keys()))
            return str(value) if stringify_plain_lists else value
        if isinstance(value, dict):
            return str(list(value.keys()))
        return value

    def get_documents(self, filename, limit=10):
        """Return a sample of the documents stored in *filename*.

        CSV/TSV: the first non-blank, non-``#`` line is the header; each
        following line becomes a dict keyed by the header fields. The
        separator is ``,`` when the file name ends in ``csv`` and a tab
        otherwise.
        XML: a single document holding the first chunk (2048 units) of text.
        JSON: the parsed document(s) with nested values summarised by
        ``_summarize_json_value``.

        Returns:
            tuple ``(documents[:limit], limit)``.
        """
        results = []
        dstype = self.datasource.dstype
        if dstype == DataSourceType.SPARK_CSV or \
                dstype == DataSourceType.SPARK_TSV:
            separator = ',' if filename[-3:] == "csv" else '\t'
            header = None
            with self.client.read(filename,
                                  encoding='utf-8',
                                  delimiter="\n") as reader:
                for line in reader:
                    if len(line.strip()) == 0 or line[0] == '#':
                        continue
                    fields = line.split(separator)
                    if header is None:
                        header = fields
                        continue
                    # zip stops at the shorter of header/fields
                    results.append(dict(zip(header, fields)))
                    # only ``limit`` rows are returned, so stop reading then
                    if len(results) >= limit:
                        break
        elif dstype == DataSourceType.SPARK_XML:
            with self.client.read(filename, encoding='utf-8',
                                  chunk_size=2048) as reader:
                for chunk in reader:
                    # only the first chunk is sampled
                    results.append({'content': str(chunk)})
                    print(results)
                    break
        elif dstype == DataSourceType.SPARK_JSON:
            with self.client.read(filename, encoding='utf-8') as reader:
                model = load(reader)
                summarize = self._summarize_json_value
                if isinstance(model, list):
                    # plain lists inside array elements are stringified
                    results.extend(
                        {p: summarize(md[p], True) for p in md}
                        for md in model)
                else:
                    # plain lists in a single object are kept as lists
                    results.append(
                        {p: summarize(model[p], False) for p in model})

        return results[:limit], limit
示例#9
0
class HadoopWebExplorer:
    """Small convenience wrapper around an HDFS client.

    The client is configured from the ``.hdfscli.cfg`` file located next to
    this module. Progress messages are only emitted when ``debug`` is true.
    """

    def __init__(self, debug=False):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            '.hdfscli.cfg')
        self.client = Config(path).get_client()
        self.debug = debug

    def print(self, *args):
        """Print *args* only when the explorer was created with debug=True."""
        if self.debug:
            print(*args)

    def path_exists(self, path):
        """
        Checks whether such path already exists
        :param path: path to check
        :type path: unicode
        :return: boolean flag indicating whether path already exists or not
        :rtype: bool
        """
        # strict=False makes status() return None instead of raising
        return self.client.status(path, strict=False) is not None

    @catch_hdfs_error
    def create_folder(self, folder_name):
        """
        Creates folder with the given name if it does not exist
        :param folder_name: the name of the folder we want to add
        :type folder_name: unicode
        :return: True once the folder exists (already present or newly
            created); the value on failure is presumably supplied by
            catch_hdfs_error -- confirm
        :rtype: bool
        """
        if self.path_exists(folder_name):
            self.print(f'Folder already exists: {folder_name}')
            return True

        self.print(f'Folder does not exist: {folder_name}')
        self.client.makedirs(folder_name)
        self.print(f'Folder created: {folder_name}')
        return True

    @catch_hdfs_error
    def write_to_file(self,
                      folder_name,
                      file_name,
                      data,
                      overwrite=False,
                      append=False):
        """
        Writes provided data into file in the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file where data should be written to
        :type file_name: unicode
        :param data: data to be written
        :type data: unicode
        :param overwrite: overwrite any existing file or directory
        :type overwrite: bool
        :param append: append to a file rather than create a new one.
        :type append: bool
        :return: True after the data was written; the value on failure is
            presumably supplied by catch_hdfs_error -- confirm
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        # Appending only applies to an existing file; when the target is
        # missing the data is written as a new file instead.
        use_append = append and self.path_exists(path)
        self.client.write(path,
                          data,
                          encoding='utf-8',
                          overwrite=overwrite,
                          append=use_append)
        self.print("Written data to HDFS file")
        return True

    @catch_hdfs_error
    def read_from_file(self, folder_name, file_name):
        """
        Reads from file in the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file where data should be read from
        :type file_name: unicode
        :return: None when the file does not exist, otherwise whatever
            ``client.read(path)`` returns (the caller is expected to use it
            in a ``with`` statement -- TODO confirm)
        """
        path = os.path.join(folder_name, file_name)
        if not self.path_exists(path):
            self.print(f'File does not exists: {path}')
            return None
        return self.client.read(path)

    @catch_hdfs_error
    def delete_file(self, folder_name, file_name):
        """
        Deletes file in the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file to be deleted
        :type file_name: unicode
        :return: the result of the client's delete call
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        return self.client.delete(path)

    @catch_hdfs_error
    def delete_folder(self, folder_name):
        """
        Deletes the specified folder and everything below it
        :param folder_name: name of the folder to be deleted
        :type folder_name: unicode
        :return: the result of the client's delete call
        :rtype: bool
        """
        return self.client.delete(folder_name, recursive=True)

    @catch_hdfs_error
    def explore_folder(self, folder_name):
        """
        Explores the specified folder, printing every file found below it
        together with its block size, length and owner
        :param folder_name: name of the folder to be observed
        :type folder_name: unicode
        """
        if not self.path_exists(folder_name):
            self.print(f'Folder does not exists: {folder_name}')
            # nothing to walk
            return
        self.print(f'Exploring folder: {folder_name}')
        # status=True makes walk yield (name, status-dict) pairs
        for path, dirs, files in self.client.walk(folder_name, status=True):
            for name, info in files:
                block_size = info['blockSize']
                size = info['length']
                owner = info['owner']
                self.print(
                    f'\tFile: {name}, blockSize: {block_size}, size: {size}, owner: {owner}'
                )
class HomuraFS():
    """Interactive shell that syncs an attached device with an HDFS folder.

    Local and HDFS states are compared through XML snapshots handled by
    ``HomuraMeta``; the resulting create/modify/delete operations are then
    applied in both directions.
    """

    def __init__(self):
        self.client = Config().get_client('dev')
        self.prompt = 'homura_fs $ '
        self.name = None
        self.local_xml = None
        self.hdfs_xml = '.last_sync.xml'
        self.hdfs_loc_xml = None
        self.mount_root = None  # set when a device is chosen in shell_loop
        self.hdfs_root = '/cs219'
        self.meta = HomuraMeta()
        # The device monitor is only started on macOS; elsewhere it stays None.
        self.monitor = None
        if sys.platform.startswith('darwin'):
            logging.basicConfig(filename='mylog.log', level=logging.INFO)
            self.monitor = Monitor_Start()

    def shell_loop(self):
        """Read commands until ``quit``.

        ``sync`` lists the attached devices, asks for one and syncs it;
        ``test`` and ``download`` are placeholders; any other input is
        ignored.
        """
        while True:
            cmd = raw_input(self.prompt)

            if cmd == 'sync':
                print("Current devices attached:")

                # No monitor (non-macOS) means no device can be attached.
                if self.monitor is None or len(self.monitor.devs) == 0:
                    print("No device attached")
                    continue

                id_mapping = dict()
                for count, dev in enumerate(self.monitor.devs, 1):
                    id_mapping[count] = dev
                    print("{}) Dname: {}, Hname: {}, Manufacture: {}.\n".format(
                        count, dev['Dname'], dev['Hname'], dev['Man']))

                try:
                    dev_id = int(raw_input("Which device to sync:\n"))
                except ValueError:
                    print("Invalid device number")
                    continue

                # 0 and unknown numbers cancel the sync
                if dev_id not in id_mapping:
                    continue

                self.name = ''
                self.mount_root = id_mapping[dev_id]['Path']
                self.local_xml = self.mount_root + '/.last_sync.xml'
                self.hdfs_loc_xml = self.mount_root + '/.cur_hdfs.xml'
                self.meta.myRootpath = self.mount_root

                log('Mount root is ' + self.mount_root)
                log('Device xml file is ' + self.local_xml)
                log('HDFS xml file is ' + self.hdfs_xml)
                log('Copy of HDFS xml stored at ' + self.hdfs_loc_xml)
                log('Syncing files for device ' +
                    id_mapping[dev_id]['Dname'])
                self.sync_files()

            elif cmd == 'test':
                pass
            elif cmd == 'download':
                pass
            elif cmd == 'quit':
                if self.monitor:
                    Monitor_Stop(self.monitor)
                return

    def download_all(self):
        """Download the whole HDFS root onto the device and snapshot it.

        The HDFS root is downloaded below the mount root, its visible
        entries are moved up into the mount root and the download folder is
        removed. Failures are logged and followed by a best-effort cleanup.
        """
        log('Downloading all files from HDFS to local device')
        staging = self.mount_root + self.hdfs_root
        try:
            self.create_file(self.mount_root, self.hdfs_root, 1)
            for dir_or_file in os.listdir(staging):
                if not dir_or_file.startswith('.'):
                    shutil.move(staging + '/' + dir_or_file,
                                self.mount_root)
            shutil.rmtree(staging)
        except Exception:
            log('Something went wrog while downloading files')
            try:
                shutil.rmtree(staging)
            except Exception:
                # best-effort cleanup: the folder may not exist
                pass

        self.meta.path2Xml(self.mount_root)
        self.meta.saveXml(self.local_xml, Xml='temp')

    def upload_all(self):
        """Upload every visible entry of the mount root to the HDFS root.

        Entries starting with ``.`` are skipped; a failed upload is logged
        and the remaining entries are still attempted.
        """
        log('Uploading all files from local device to HDFS')

        for dir_or_file in os.listdir(self.mount_root):
            if dir_or_file.startswith('.'):
                continue
            try:
                log('Uploading to ' + self.hdfs_root + '/' + dir_or_file)
                self.client.upload(self.hdfs_root + '/' + dir_or_file,
                                   self.mount_root + '/' + dir_or_file,
                                   n_threads=0)
            except Exception:
                log('Warning: could not upload')

    def load_HDFS_XML(self):
        """Fetch the HDFS snapshot xml into a local copy and load it.

        The local copy is removed afterwards, also when loading fails.
        """
        log("Attempting to fetch HDFS xml")
        self.update_file(self.hdfs_loc_xml, self.hdfs_xml, 1)
        log("Loading HDFS xml")
        try:
            self.meta.loadHDFSXml(self.hdfs_loc_xml)
        finally:
            os.remove(self.hdfs_loc_xml)

    def sync_files(self):
        """Reconcile the device with HDFS and store the new snapshot."""
        # check if we have an old snapshot xml
        if os.path.isfile(self.local_xml):
            log("Fetching local snapshot xml from " + self.local_xml)
            self.meta.loadSnapshotXml(self.local_xml)
        else:
            # snapshot doesn't exist, so start from an empty one
            log("No local snapshot file was found at " + self.local_xml)
            self.meta.Snapshotdoc = self.meta.emptyXml()

        try:
            # fetch HDFS xml and store locally
            self.load_HDFS_XML()
        except Exception:
            # no usable HDFS snapshot: treat HDFS as empty
            self.meta.HDFSdoc = self.meta.emptyXml()

        self.meta.path2Xml(self.mount_root)
        self.meta.mydoc = self.meta.tempdoc

        # find operations since last sync
        (my_creates, my_deletes, my_modifies, hdfs_creates, hdfs_deletes,
         hdfs_modifies) = self.meta.getOperations()

        root = self.mount_root
        name = self.hdfs_root

        # apply operations on current device
        for path in my_creates:
            if path.endswith('/'):  # path is a folder we want to create
                os.makedirs(root + path)
            else:
                self.create_file(root + path, name + path, 1)
        for path in my_modifies:
            self.update_file(root + path, name + path, 1)
        for path in my_deletes:
            self.delete_file(root + path, 1)

        # apply operations on HDFS
        for path in hdfs_creates:
            if path.endswith('/'):  # path is a folder we want to create
                self.client.makedirs(name + path)
            else:
                self.create_file(root + path, name + path, 0)
        for path in hdfs_modifies:
            self.update_file(root + path, name + path, 0)
        for path in hdfs_deletes:
            self.delete_file(name + path, 0)

        # update last sync for both HDFS and current device
        self.meta.path2Xml(self.mount_root)
        self.meta.saveXml(self.local_xml, Xml='temp')
        self.update_file(self.local_xml, self.hdfs_xml, 0)

        return

    # in this set of functions, when kyuubey = 0, the operation goes
    # from loc to hdfs (i.e. local becomes the "master")
    # when kyuubey = 1, the operation goes from hdfs to loc
    # (i.e. hdfs becomes the "master")
    def create_file(self, loc_path, hdfs_path, kyuubey):
        """Upload (kyuubey=0) or download (kyuubey=1) a path."""
        if kyuubey == 0:
            log('Creating ' + hdfs_path + ' on HDFS')
            self.client.upload(hdfs_path, loc_path, n_threads=0)
        elif kyuubey == 1:
            log('Creating ' + loc_path + ' locally')
            self.client.download(hdfs_path, loc_path, n_threads=0)

    def update_file(self, loc_path, hdfs_path, kyuubey):
        """Overwrite the HDFS file from local (0) or the local file from HDFS (1)."""
        if kyuubey == 0:  # updating file on HDFS
            log('Updating file ' + hdfs_path + ' on HDFS')
            with open(loc_path) as reader:
                with self.client.write(hdfs_path, overwrite=True) as writer:
                    for line in reader:
                        writer.write(line)
        elif kyuubey == 1:
            log('Updating file ' + loc_path + ' locally')
            # Read from HDFS first so a failed read does not leave the local
            # file truncated or an empty file behind.
            with self.client.read(hdfs_path) as reader:
                data = reader.read()
            with open(loc_path, 'w') as writer:
                writer.write(data)

    def delete_file(self, path, kyuubey):
        """Delete *path* on HDFS (0, recursively) or locally (1)."""
        if kyuubey == 0:  # delete file on HDFS
            log('Deleting file ' + path + ' from HDFS')
            self.client.delete(path, recursive=True)
        elif kyuubey == 1:  # delete file locally
            log('Deleting file ' + path + ' locally')
            os.remove(path)

    def move_file(self, src_path, dst_path, kyuubey):
        """Rename *src_path* to *dst_path* on HDFS (0) or locally (1)."""
        if kyuubey == 0:  # move file on HDFS
            log('Moving file from ' + src_path + ' to ' + dst_path +
                ' on HDFS')
            self.client.rename(src_path, dst_path)
        elif kyuubey == 1:  # move file locally
            os.rename(src_path, dst_path)
            log('Moving file from ' + src_path + ' to ' + dst_path +
                ' locally')

    def __test(self, test_no=1):
        """Reset the mount directory and populate it with a test layout."""
        self.__reset_test()
        if test_no == 1:
            self.__config_basic()
        elif test_no == 2:
            self.__config_outer_empty()

    def __reset_test(self):
        """Replace the mount directory with an empty one."""
        root = self.mount_root
        log('Resetting mount directory')
        if os.path.exists(root):
            shutil.rmtree(root)
        os.makedirs(root)

    def __config_basic(self):
        """Test layout 1: three files at the top level plus two nested folders."""
        root = self.mount_root
        log('Config 1: default')
        with open(root + '/test1.txt', 'w') as writer:
            writer.write('hi\nthere\n!\n')
        with open(root + '/test2.txt', 'w') as writer:
            writer.write('one-liner')
        with open(root + '/test3.txt', 'w') as writer:
            writer.write('')
        os.makedirs(root + '/subdir')
        with open(root + '/subdir/test1.txt', 'w') as writer:
            writer.write('a different\ntest1.txt\nfile!\n')
        os.makedirs(root + '/subdir/subsubdir')
        with open(root + '/subdir/subsubdir/test1.txt', 'w') as writer:
            writer.write('yet another different\ntest1.txt\nfile!\n')

    def __config_outer_empty(self):
        """Test layout 2: no top-level files, only the two nested folders."""
        root = self.mount_root
        log('Config 2: outer directory empty')
        os.makedirs(root + '/subdir')
        with open(root + '/subdir/test1.txt', 'w') as writer:
            writer.write('a different\ntest1.txt\nfile!\n')
        os.makedirs(root + '/subdir/subsubdir')
        with open(root + '/subdir/subsubdir/test1.txt', 'w') as writer:
            writer.write('yet another different\ntest1.txt\nfile!\n')
示例#11
0
def main():
    """Encrypt or decrypt selected columns of a delimited file.

    The input is read and the output written through the client returned by
    ``Config(path=...).get_client()``. The file is processed in pandas
    chunks; the header row is written only with the first chunk.

    * ``operation == 'encrypt'``: a fresh AES key is generated, stored via
      ``encrypt_and_store_aes_key`` under a new UUID, and for every selected
      column a ``<column>_HASH`` column is added (``hash_value``) before the
      column itself is replaced by the result of ``encrypt``.
    * any other operation: treated as decryption. The value needed to
      recover the AES key is parsed out of the first row of the first
      selected column, then every selected column is passed to ``decrypt``.
    """
    options = parsing_options()
    hdfs = Config(path=options.hdfsConf).get_client()
    az_conf = read_conf(options.azureConf)
    vault = az_key_vault_connection(az_conf['azure_client_id'],
                                    az_conf['azure_client_secret'],
                                    az_conf['azure_tenant_id'])
    rsa_key = az_get_rsa_key_info(vault, az_conf['key_vault'],
                                  az_conf['key_name'])

    # options.column is an iterable of iterables; flatten it into one list.
    targets = [name for group in options.column for name in group]

    with hdfs.read(options.input) as source:
        with hdfs.write(options.output,
                        overwrite=options.overwrite) as sink:
            if options.operation == 'encrypt':
                aes_key = generate_aes_key()
                az_conf['uuid'] = str(uuid.uuid4())
                encrypt_and_store_aes_key(vault, az_conf,
                                          rsa_key['version'],
                                          base64.b64encode(aes_key))
                reader = pd.read_csv(source,
                                     sep=options.delimiter,
                                     header=options.header,
                                     dtype=str,
                                     chunksize=10000)
                # One "<name>_HASH" column per selected column.
                hashed_names = [str(name) + '_HASH' for name in targets]
                for position, chunk in enumerate(reader):
                    # Hash first, while the plain values are still present.
                    chunk[hashed_names] = chunk[targets].apply(hash_value)
                    # Then overwrite the selected columns in place.
                    chunk[targets] = chunk[targets].apply(
                        encrypt, args=(aes_key, az_conf['uuid']))
                    # Only the first chunk carries the header row.
                    chunk.to_csv(sink,
                                 sep=options.delimiter,
                                 header=(position == 0),
                                 index=False)
            else:
                reader = pd.read_csv(source,
                                     sep=options.delimiter,
                                     header=options.header,
                                     dtype=str,
                                     chunksize=1000)
                for position, chunk in enumerate(reader):
                    is_first = position == 0
                    if is_first:
                        # Split the first selected column on '-' at most 3
                        # times and take piece index 3 of the row labelled
                        # 0; its base64-decoded value is what
                        # retrieve_and_decrypt_aes_key receives.
                        # NOTE(review): presumably the stored key
                        # reference embedded at encryption time - confirm.
                        pieces = chunk[targets[0]].str.split(
                            pat='-', n=3, expand=True)
                        key = base64.b64decode(pieces[3][0])
                        aes_key = retrieve_and_decrypt_aes_key(
                            vault, az_conf, rsa_key['version'], key)
                    chunk[targets] = chunk[targets].apply(decrypt,
                                                          args=(aes_key, ))
                    # Only the first chunk carries the header row.
                    chunk.to_csv(sink,
                                 sep=options.delimiter,
                                 header=is_first,
                                 index=False)