Exemplo n.º 1
0
    def _parse_serializers(self):
        """
        Resolve the 'input', 'output' and 'inter' serializers and register them

        The serializer names are looked up in the environment configuration
        read through SERIALIZATION_CONF_PICKE_FILE_PATH. A stage missing from
        that configuration (or every stage, when the configuration is empty
        or absent) falls back to its DEFAULT_*_SERIALIZED name. The resulting
        mapping is handed to self._set_serializers.
        """
        configured = self._get_env_conf(SERIALIZATION_CONF_PICKE_FILE_PATH)
        if not configured:
            configured = {}

        # (stage key, fallback serializer name), resolved in this order
        stage_defaults = (
            ('input', DEFAULT_INPUT_SERIALIZED),
            ('output', DEFAULT_OUTPUT_SERIALIZED),
            ('inter', DEFAULT_INTER_SERIALIZED),
        )

        resolved = {}
        for stage, fallback in stage_defaults:
            resolved[stage] = get_protocol_from_name(configured.get(stage, fallback))

        self._set_serializers(resolved)
Exemplo n.º 2
0
    def cat(self, path, serializer='raw', tab_separated=False):
        """
        Returns a generator over files defined by path

        Runs the hadoop 'fs -cat' command on path through self._hdpm and yields
        one decoded item per line of its standard output. Trailing whitespace
        is stripped from every line before it is decoded.

        :param path: path to the files
        :param serializer: input serializer. Options are json, pickle and raw(default)
        :param tab_separated: boolean if input is tab separated. When true, a line
            containing at least one tab is yielded as a tuple of its decoded fields;
            a line without tabs is yielded as a single decoded value
        """
        job = self._hdpm._run_hadoop_cmd('fs', ('-cat', path))
        output = job.yield_stdout()
        output_serializer = get_protocol_from_name(serializer)

        for line in output:
            # NOTE: rstrip() also removes trailing tabs, so trailing empty
            # fields are dropped before the line is split.
            line = line.rstrip()

            if tab_separated:
                fields = line.split('\t')
                if len(fields) > 1:
                    # Reuse the fields split above rather than splitting twice
                    yield tuple(output_serializer.decode(field) for field in fields)
                    continue

            # Either not tab separated, or a single-field line: decode it whole
            yield output_serializer.decode(line)

        # Only reached once the output has been fully consumed; a caller that
        # abandons the generator early never gets here.
        job.join()
Exemplo n.º 3
0
    def cat(self, path, serializer='raw', tab_separated=False):
        """
        Returns a generator over files defined by path

        Runs the hadoop 'fs -cat' command on path through self._hdpm and
        yields one decoded item per line of its standard output. Trailing
        whitespace is stripped from every line before it is decoded.

        :param path: path to the files
        :param serializer: input serializer. Options are json, pickle and
            raw(default)
        :param tab_separated: boolean if input is tab separated. When true,
            a line containing at least one tab is yielded as a tuple of its
            decoded fields; a line without tabs is yielded as a single
            decoded value
        """
        job = self._hdpm._run_hadoop_cmd('fs', ('-cat', path))
        output = job.yield_stdout()
        output_serializer = get_protocol_from_name(serializer)

        for line in output:
            # NOTE: rstrip() also removes trailing tabs, so trailing empty
            # fields are dropped before the line is split.
            line = line.rstrip()

            if tab_separated:
                fields = line.split('\t')
                if len(fields) > 1:
                    # Reuse the fields split above rather than splitting
                    # the same line a second time
                    yield tuple(
                        output_serializer.decode(field)
                        for field in fields)
                    continue

            # Either not tab separated, or a single-field line
            yield output_serializer.decode(line)

        # Only reached once the output has been fully consumed; a caller
        # that abandons the generator early never gets here.
        job.join()
Exemplo n.º 4
0
    def _parse_serializers(self):
        """
        Build the serializer mapping and pass it to self._set_serializers

        For each of the 'input', 'output' and 'inter' stages, the serializer
        name comes from the environment configuration read through
        SERIALIZATION_CONF_PICKE_FILE_PATH, or from the matching
        DEFAULT_*_SERIALIZED constant when the stage is not configured. An
        empty or missing configuration is treated as an empty dict.
        """
        env_conf = self._get_env_conf(SERIALIZATION_CONF_PICKE_FILE_PATH)
        ser_names = env_conf if env_conf else {}

        # Stage key paired with the name used when the stage is unconfigured
        fallbacks = [
            ('input', DEFAULT_INPUT_SERIALIZED),
            ('output', DEFAULT_OUTPUT_SERIALIZED),
            ('inter', DEFAULT_INTER_SERIALIZED),
        ]

        protocols = {
            key: get_protocol_from_name(ser_names.get(key, default_name))
            for key, default_name in fallbacks
        }

        self._set_serializers(protocols)