Example #1
from json import dump, load
from hdfs import Config

# Get a client using the `dev` alias defined in the HdfsCLI configuration.
client = Config().get_client('dev')

# Some sample data that we want to upload to HDFS.
model = {
  'first_feature': 2.,
  'second_feature': 12.,
}

# First, we delete any existing `models/` folder on HDFS.
client.delete('models', recursive=True)

# We can now upload the data, first as CSV.
with client.write('models/1.csv', encoding='utf-8') as writer:
  for item in model.items():
    writer.write(u'%s,%s\n' % item)

# We can also serialize it to JSON and directly upload it.
with client.write('models/1.json', encoding='utf-8') as writer:
  dump(model, writer)

# We can check that the files exist and get their properties.
assert client.list('models') == ['1.csv', '1.json']
status = client.status('models/1.csv')
content = client.content('models/1.json')

# Later, we can download the files back. The `delimiter` option makes it
# convenient to read CSV files.
with client.read('models/1.csv', delimiter='\n', encoding='utf-8') as reader:
  items = (line.split(',') for line in reader if line)
  assert dict((name, float(value)) for name, value in items) == model

# Loading JSON directly from HDFS is even simpler.
with client.read('models/1.json', encoding='utf-8') as reader:
  assert load(reader) == model
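
For reference, the `status` and `content` values above are plain dictionaries mirroring the WebHDFS `FileStatus` and `ContentSummary` objects; a short sketch of the kind of fields one can expect:

# `status` mirrors a WebHDFS FileStatus object, e.g.:
print(status['length'])     # file size in bytes
print(status['blockSize'])  # HDFS block size used for the file
print(status['type'])       # 'FILE' or 'DIRECTORY'

# `content` mirrors a WebHDFS ContentSummary object, e.g.:
print(content['fileCount'])      # number of files under the path
print(content['spaceConsumed'])  # bytes consumed, replication included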
Example #2
import os
from json import load

from hdfs import Config

# `DataSourceType` is an application-specific enum (SPARK_CSV, SPARK_TSV,
# SPARK_XML, SPARK_JSON) imported from elsewhere in this project.


class SparkHDFSClient(object):
    def __init__(self, datasource):
        self.datasource = datasource
        self.client = Config().get_client("dev")

    def get_file_list(self, folder):
        files = self.client.list(folder.strip())
        files = [folder + '/' + file for file in files]
        return files

    def list_collections(self):
        results = []
        status = self.client.status(self.datasource.url, strict=False)
        print(status, self.datasource.url)
        if status is not None:
            if status['type'] == "DIRECTORY":
                files = self.get_file_list(self.datasource.url)
                while len(files) > 0:
                    file = files.pop()
                    status = self.client.status(
                        os.path.join(self.datasource.url, file), strict=False)
                    if status is None:
                        continue
                    if status['type'] == "DIRECTORY":
                        subfiles = self.get_file_list(
                            os.path.join(self.datasource.url, file))
                        files.extend(subfiles)
                        continue
                    else:
                        # Skip files whose extension does not match the
                        # declared datasource type.
                        if (self.datasource.dstype == DataSourceType.SPARK_CSV and not file.endswith('sv')) \
                                or (self.datasource.dstype == DataSourceType.SPARK_TSV and not file.endswith('sv')) \
                                or (self.datasource.dstype == DataSourceType.SPARK_XML and not file.endswith('xml')) \
                                or (self.datasource.dstype == DataSourceType.SPARK_JSON and not file.endswith('json')):
                            continue
                        row = {
                            "db": file[:file.rfind('/')] if '/' in file else self.datasource.url,
                            "document": file[file.rfind('/') + 1:] if '/' in file else file,
                            "count": -1,
                        }
                        results.append(row)

                return results
            else:
                return [{
                    "db": self.datasource.url,
                    "document": self.datasource.url,
                    "count": -1
                }]
        else:
            return results

    def get_documents(self, filename, limit=10):
        results = []
        delimiter = "\n"
        header = None
        rows = 0
        if self.datasource.dstype == DataSourceType.SPARK_CSV or \
                self.datasource.dstype == DataSourceType.SPARK_TSV:
            delimiter = "\n"
            with self.client.read(filename,
                                  encoding='utf-8',
                                  delimiter=delimiter) as reader:
                for line in reader:
                    if len(line.strip()) == 0 or line[0] == '#':
                        continue
                    if filename[-3:] == "csv":
                        line = line.split(',')
                    else:
                        line = line.split('\t')

                    if header is None:
                        header = line
                        continue
                    res = {
                        header[i]: line[i]
                        for i in range(len(line)) if i < len(header)
                    }
                    results.append(res)
                    rows += 1
                    if rows > limit + 1:
                        break
        elif self.datasource.dstype == DataSourceType.SPARK_XML:
            with self.client.read(filename, encoding='utf-8',
                                  chunk_size=2048) as reader:
                header = ['content']
                for chunk in reader:
                    res = {'content': str(chunk)}
                    results.append(res)
                    print(results)
                    break
        elif self.datasource.dstype == DataSourceType.SPARK_JSON:
            with self.client.read(filename, encoding='utf-8') as reader:
                model = load(reader)
                if isinstance(model, list):
                    # Flatten nested values into readable strings: a list of
                    # dicts is summarised by the keys of its first element,
                    # other containers are stringified, scalars pass through.
                    model = [{
                        p: str(list(md[p][0].keys()))
                        if isinstance(md[p], list) and isinstance(md[p][0], dict)
                        else str(md[p]) if isinstance(md[p], list)
                        else str(list(md[p].keys())) if isinstance(md[p], dict)
                        else md[p]
                        for p in md
                    } for md in model]
                    results.extend(model)
                else:
                    model = {
                        p: str(list(model[p][0].keys()))
                        if isinstance(model[p], list) and isinstance(model[p][0], dict)
                        else model[p] if isinstance(model[p], list)
                        else str(list(model[p].keys())) if isinstance(model[p], dict)
                        else model[p]
                        for p in model
                    }
                    results.append(model)

        return results[:limit], limit
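
A minimal usage sketch (the datasource object below is hypothetical, standing in for whatever model the project actually uses; it only needs `url` and `dstype` attributes):

# Hypothetical stand-in for the project's datasource model.
class FakeDataSource:
    url = '/data/models'
    dstype = DataSourceType.SPARK_JSON

client = SparkHDFSClient(FakeDataSource())
for entry in client.list_collections():
    print(entry['db'], entry['document'])

docs, limit = client.get_documents('/data/models/1.json', limit=5)
print(docs)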
Example #3
import os

from hdfs import Config

# `catch_hdfs_error` is assumed to be a project-specific decorator that traps
# HDFS errors; a possible sketch of it follows the class.


class HadoopWebExplorer:
    def __init__(self, debug=False):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            '.hdfscli.cfg')
        self.client = Config(path).get_client()
        self.debug = debug

    def print(self, *args):
        if self.debug:
            print(*args)

    def path_exists(self, path):
        """
        Checks whether such path already exists
        :param path: path to check
        :type path: unicode
        :return: boolean flag indicating whether path already exists or not
        :rtype: bool
        """
        return self.client.status(path, strict=False) is not None

    @catch_hdfs_error
    def create_folder(self, folder_name):
        """
        Creates folder with the given name if it does not exist
        :param folder_name: the name of the folder we want to add
        :type folder_name: unicode
        :return: returns true if created folder or it already exists, otherwise false
        :rtype: bool
        """
        if self.path_exists(folder_name):
            self.print(f'Folder already exists: {folder_name}')
            return True

        self.print(f'Folder does not exist: {folder_name}')
        self.client.makedirs(folder_name)
        self.print(f'Folder created: {folder_name}')
        return True

    @catch_hdfs_error
    def write_to_file(self,
                      folder_name,
                      file_name,
                      data,
                      overwrite=False,
                      append=False):
        """
        Writes provided data into file in the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file where data should be written to
        :type file_name: unicode
        :param data: data to be written
        :type data: unicode
        :param overwrite: overwrite any existing file or directory
        :type overwrite: bool
        :param append: append to a file rather than create a new one.
        :type append: bool
        :return: returns true if it successfully wrote the data, otherwise false
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        # Appending requires the file to already exist, so fall back to a
        # plain write when it does not.
        if append and not self.path_exists(path):
            self.client.write(path,
                              data,
                              encoding='utf-8',
                              overwrite=overwrite)
        else:
            self.client.write(path,
                              data,
                              encoding='utf-8',
                              overwrite=overwrite,
                              append=append)
        self.print("Written data to HDFS file")

    @catch_hdfs_error
    def read_from_file(self, folder_name, file_name):
        """
        Reads from file in the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file where data should be read from
        :type file_name: unicode
        """
        path = os.path.join(folder_name, file_name)
        if not self.path_exists(path):
            self.print(f'File does not exist: {path}')
            return None
        # Note: `client.read` returns a context manager; callers should wrap
        # the result in a `with` block to consume it.
        return self.client.read(path)

    @catch_hdfs_error
    def delete_file(self, folder_name, file_name):
        """
        Deletes file in the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :param file_name: name of the file to be deleted
        :type file_name: unicode
        :return: returns true if it successfully deleted the file, otherwise false
        :rtype: bool
        """
        path = os.path.join(folder_name, file_name)
        return self.client.delete(path)

    @catch_hdfs_error
    def delete_folder(self, folder_name):
        """
        Deletes the specified folder
        :param folder_name: name of the folder where file is located
        :type folder_name: unicode
        :return: returns true if it successfully deleted the folder, otherwise false
        :rtype: bool
        """
        return self.client.delete(folder_name, recursive=True)

    @catch_hdfs_error
    def explore_folder(self, folder_name):
        """
        Explores the specified folder
        :param folder_name: name of the folder to be observed
        :type folder_name: unicode
        """
        if not self.path_exists(folder_name):
            self.print(f'Folder does not exist: {folder_name}')
            return
        self.print(f'Exploring folder: {folder_name}')
        for path, dirs, files in self.client.walk(folder_name, status=True):
            for file in files:
                block_size = file[1]['blockSize']
                size = file[1]['length']
                owner = file[1]['owner']
                self.print(
                    f'\tFile: {file[0]}, blockSize: {block_size}, size: {size}, owner: {owner}'
                )
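
The `catch_hdfs_error` decorator itself is not shown above; a minimal sketch of one plausible implementation, assuming it simply traps the library's `HdfsError` and reports failure:

from functools import wraps

from hdfs.util import HdfsError

def catch_hdfs_error(method):
    # Wrap an instance method so HDFS errors are reported instead of raised.
    @wraps(method)
    def wrapper(self, *args, **kwargs):
        try:
            return method(self, *args, **kwargs)
        except HdfsError as error:
            self.print(f'HDFS error: {error}')
            return False
    return wrapper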
Example #4
from hdfs import Config

client = Config().get_client('dev')
client.write('a/p', 'aaa', overwrite=True)
print(client.status('a'))
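
To verify the write, the file can be read straight back with the same client:

with client.read('a/p', encoding='utf-8') as reader:
    assert reader.read() == 'aaa'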
Example #5
from hdfs import Config
from sys import argv
from math import ceil

script, filename = argv

client = Config().get_client()

status = client.status(filename)

# Number of HDFS blocks the file occupies: total size divided by the
# block size, rounded up.
print(ceil(status['length'] / status['blockSize']))
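
The same idea extends to a whole directory; a sketch that sums per-file block counts, reusing the `walk(..., status=True)` pattern from Example #3:

def total_blocks(client, folder):
    # Sum block counts for every file under `folder`.
    total = 0
    for path, dirs, files in client.walk(folder, status=True):
        for name, status in files:
            total += ceil(status['length'] / status['blockSize'])
    return total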