Exemplo n.º 1
0
def save_keras_model(limonero_url, token, storage_id, path, keras_model):
    """
    Saves a Keras model with information provided by Limonero.

    :param limonero_url URL for Limonero
    :param token Limonero auth token
    :param storage_id Limonero storage id
    :param path Path where model will be stored ('.h5' is appended if missing)
    :param keras_model Model to be saved
    :raises ValueError if the storage type is neither HDFS nor LOCAL
    """
    storage = limonero_service.get_storage_info(limonero_url, token,
                                                storage_id)
    if storage.type not in ['HDFS', 'LOCAL']:
        # Translate the template first, then interpolate, so the message
        # catalog lookup uses the constant string.
        raise ValueError(
            gettext('Unsupported storage type: {}').format(storage.type))

    if not path.endswith('.h5'):
        path += '.h5'
    # FIXME: review the path
    # NOTE(review): final_path is only used by the HDFS branch; the LOCAL
    # branch saves to `path` as given -- confirm this is intended.
    final_path = os.path.join(storage.url, 'models', path)

    if storage.type == 'HDFS':
        # Stores the model in a temporary file to copy it to storage.
        # mkstemp returns an open descriptor that must be closed by the
        # caller; the '.h5' suffix keeps the temp name consistent with the
        # target for Keras versions that infer the format from the extension.
        fd, filename = tempfile.mkstemp(suffix='.h5')
        os.close(fd)
        try:
            keras_model.save(filename)

            # Copy to HDFS
            h = HdfsUtil(storage.url)
            h.copy_from_local(filename, final_path)
        finally:
            # Do not leave the temporary copy behind, even on failure
            if os.path.isfile(filename):
                os.remove(filename)
    elif storage.type == 'LOCAL':
        keras_model.save(path)
Exemplo n.º 2
0
def load_keras_model(limonero_url, token, storage_id, path):
    """
    Loads a Keras model with information provided by Limonero.

    :param limonero_url URL for Limonero
    :param token Limonero auth token
    :param storage_id Limonero storage id
    :param path Path from where the model will be loaded
    :returns Loaded Keras model
    :raises ValueError if the storage type is neither HDFS nor LOCAL
    """
    storage = limonero_service.get_storage_info(limonero_url, token,
                                                storage_id)
    if storage.type not in ['HDFS', 'LOCAL']:
        # Translate the template first, then interpolate, so the message
        # catalog lookup uses the constant string.
        raise ValueError(
            gettext('Unsupported storage type: {}').format(storage.type))

    # NOTE(review): final_path is only used by the HDFS branch; the LOCAL
    # branch loads from `path` as given -- confirm this is intended.
    final_path = os.path.join(storage.url, 'models', path)

    if storage.type == 'HDFS':
        # Requires temp file because Keras do not load from stream :(
        # mkstemp returns an open descriptor that must be closed by the
        # caller, and the file itself must be removed once it was consumed.
        fd, filename = tempfile.mkstemp()
        os.close(fd)
        try:
            h = HdfsUtil(storage.url)
            h.copy_to_local(final_path, filename)
            return load_model(filename)
        finally:
            if os.path.isfile(filename):
                os.remove(filename)

    elif storage.type == 'LOCAL':
        return load_model(path)
Exemplo n.º 3
0
def test_get_storage_info_failure(mocked_get):
    """
    Checks that get_storage_info raises ValueError when the mocked HTTP
    call answers with a non-success status (201 here).

    :param mocked_get Mock standing in for the HTTP GET used by the service
    """
    storage_id = 700
    text = {'id': storage_id, 'name': 'Storage HDFS', 'url': 'hdfs://test.com'}
    mocked_get.side_effect = fake_req(201, json.dumps(text))()
    url = 'http://limonero'
    token = '00000'
    # Only the call expected to raise goes inside the context manager:
    # anything placed after it in the same block would never execute.
    with pytest.raises(ValueError):
        limonero_service.get_storage_info(url, token, storage_id)

    # The request must still have been issued with the expected URL/headers.
    mocked_get.assert_called_with(
        'http://limonero/storages/{}'.format(storage_id),
        headers={'X-Auth-Token': '00000'})
Exemplo n.º 4
0
def perform_copy(config, vallum_ds_id, target_id, path):
    """
    Copies the files returned by a VALLUM query into a LOCAL storage.

    :param config Configuration dict; reads juicer.services.limonero
        ('url' and 'auth_token')
    :param vallum_ds_id Limonero id of the VALLUM data source
    :param target_id Limonero id of the target (LOCAL) storage
    :param path Sub-path appended to the target storage path
    :returns Number of files written, or a dict with 'status' == 'ERROR' and
        a 'message' when the source is not VALLUM or the target is not LOCAL
    :raises ValueError when VALLUM answers with an HTTP status other than 200
    """
    # Requests below are sent with verify=False; silence the related warning
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    services_config = config.get('juicer').get('services')
    limonero_config = services_config.get('limonero')

    limonero_url = limonero_config.get('url')
    token = str(limonero_config.get('auth_token'))
    vallum_ds = limonero_service.get_data_source_info(limonero_url, token,
                                                      vallum_ds_id)
    vallum_storage = vallum_ds.get('storage', {})
    if vallum_storage.get('type') != 'VALLUM':
        return {'status': 'ERROR', 'message': 'Storage is not VALLUM'}
    target_storage = limonero_service.get_storage_info(limonero_url, token,
                                                       target_id)
    if target_storage.get('type') != 'LOCAL':
        return {
            'status': 'ERROR',
            'message': 'Target storage must be of type LOCAL'
        }

    parsed = urlparse(vallum_storage.get('url'))
    base_url = '{}://{}:{}'.format(parsed.scheme, parsed.hostname, parsed.port
                                   or 80)
    url = base_url + parsed.path
    qs = parse_qs(parsed.query)
    # parse_qs maps each key to a list of values, so the default must be a
    # list as well (a plain string default would yield only its first char).
    database = qs.get('db', ['samples'])[0]

    params = {
        "username": parsed.username,
        "password": parsed.password,
        "database": database,
        "mode": 'MN',
        "query": vallum_ds['command'],
        "thread": 1,
    }
    req = requests.post(url, params, verify=False)
    if req.status_code != 200:
        raise ValueError('HTTP Status {}'.format(req.status_code))

    parsed_local = urlparse(target_storage.get('url'))
    target_dir = parsed_local.path + path  # '/vallum' + str(vallum_ds_id)
    obj = json.loads(req.text)
    total = 0
    # Tolerate a response without 'result': nothing to copy in that case
    for result in obj.get('result') or []:
        files = result.get('files')
        if not files:
            continue
        # Keep only the path of each reported URI and resolve it against
        # the server address taken from the storage URL.
        uri_files = [
            base_url + urlparse(f.get('uri')).path for f in files
        ]
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        for vallum_file in uri_files:
            file_req = requests.get(vallum_file, params, verify=False)
            if file_req.status_code != 200:
                raise ValueError(
                    'HTTP Status {}'.format(file_req.status_code))
            final_filename = target_dir + '/' + vallum_file.split('/')[-1]
            print(final_filename)
            total += 1
            with open(final_filename, 'wb') as fout:
                fout.write(file_req.content)
    return total
Exemplo n.º 5
0
    def generate_code(self):
        """
        Generates the Spark code that writes the input data frame to the
        storage and registers the resulting data source in Limonero.

        The storage information is retrieved from Limonero (using
        self.storage_id) when the code is generated. The data is written
        in the format selected by self.format (CSV, Parquet or JSON).

        :returns Generated Python source code as a string
        """
        # Retrieve Storage URL

        limonero_config = \
            self.parameters['configuration']['juicer']['services']['limonero']
        url = '{}'.format(limonero_config['url'])
        token = str(limonero_config['auth_token'])
        storage = limonero_service.get_storage_info(url, token,
                                                    self.storage_id)

        final_url = '{}/limonero/user_data/{}/{}/{}'.format(
            storage['url'], self.user['id'], self.path,
            strip_accents(self.name.replace(' ', '_')))
        code_save = ''
        if self.format == self.FORMAT_CSV:
            code_save = dedent("""
            cols = []
            for attr in {input}.schema:
                if attr.dataType.typeName() in ['array']:
                    cols.append(functions.concat_ws(
                        ', ', {input}[attr.name]).alias(attr.name))
                else:
                    cols.append({input}[attr.name])

            {input} = {input}.select(*cols)
            mode = '{mode}'
            # Write in a temporary directory
            # Header configuration will be handled by LemonadeFileUtil class
            {input}.write.csv('{url}{uuid}',
                         header=False, mode=mode)
            # Merge files using Hadoop HDFS API
            conf = spark_session._jsc.hadoopConfiguration()
            jvm = spark_session._jvm
            fs = jvm.org.apache.hadoop.fs.FileSystem.get(
                jvm.java.net.URI('{storage_url}'), conf)

            path = jvm.org.apache.hadoop.fs.Path('{url}')
            tmp_path = jvm.org.apache.hadoop.fs.Path(
                '{url}{uuid}')
            write_header = {header}
            # org.apache.hadoop.fs.FileUtil do not handle files with header
            header = None
            if write_header:
                header = ','.join([attr.name for attr in {input}.schema])

            fs_util = jvm.br.ufmg.dcc.lemonade.ext.io.LemonadeFileUtil
            if fs.exists(path):
                if mode == 'error':
                    raise ValueError('{error_file_exists}')
                elif mode == 'ignore':
                    emit_event(name='update task',
                        message='{warn_ignored}',
                        status='COMPLETED',
                        identifier='{task_id}')
                elif mode == 'overwrite':
                    fs.delete(path, False)
                    fs_util.copyMergeWithHeader(fs, tmp_path, fs, path, True,
                        conf, header)
                else:
                    raise ValueError('{error_invalid_mode}')
            else:
                fs_util.copyMergeWithHeader(fs, tmp_path, fs, path, True, conf,
                    header)
            """.format(input=self.named_inputs['input data'],
                       url=final_url,
                       header=self.header,
                       mode=self.mode,
                       uuid=uuid.uuid4().hex,
                       storage_url=storage['url'],
                       task_id=self.parameters['task_id'],
                       error_file_exists=_('File already exists'),
                       warn_ignored=_('File not written (already exists)'),
                       error_invalid_mode=_('Invalid mode {}').format(
                           self.mode)))
        elif self.format == self.FORMAT_PARQUET:
            code_save = dedent("""
            {}.write.parquet('{}', mode='{}')""".format(
                self.named_inputs['input data'], final_url, self.mode))
            # Need to generate an output, even though it is not used.
            code_save += '\n{0}_tmp = {0}'.format(
                self.named_inputs['input data'])
        elif self.format == self.FORMAT_JSON:
            code_save = dedent("""
            {}.write.json('{}', mode='{}')""".format(
                self.named_inputs['input data'], final_url, self.mode))

        code = dedent(code_save)

        # The header flag is rendered directly into the metadata snippet:
        # the 'write_header' variable only exists in the CSV snippet, so
        # referring to it here would break Parquet and JSON outputs.
        # Backslashes are doubled so the generated regex keeps single ones.
        code_api = """
            # Code to update Limonero metadata information
            from juicer.service.limonero_service import register_datasource
            types_names = {data_types}

            # nullable information is also stored in metadata
            # because Spark ignores this information when loading CSV files
            attributes = []
            decimal_regex = re.compile(r'DecimalType\\((\\d+),\\s*(\\d+)\\)')
            for att in {input}.schema:
                type_name = str(att.dataType)
                precision = None
                scale = None
                found = decimal_regex.findall(type_name)
                if found:
                    type_name = 'DecimalType'
                    precision = found[0][0]
                    scale = found[0][1]
                attributes.append({{
                  'enumeration': 0,
                  'feature': 0,
                  'label': 0,
                  'name': att.name,
                  'type': types_names[str(type_name)],
                  'nullable': att.nullable,
                  'metadata': att.metadata,
                  'precision': precision,
                  'scale': scale
                }})
            parameters = {{
                'attribute_delimiter': ',',
                'is_first_line_header': {header},
                'name': "{name}",
                'enabled': 1,
                'is_public': 0,
                'format': "{format}",
                'storage_id': {storage},
                'description': "{description}",
                'user_id': "{user_id}",
                'user_login': "{user_login}",
                'user_name': "{user_name}",
                'workflow_id': "{workflow_id}",
                'tags': '{tags}',
                'url': "{final_url}",
                'attributes': attributes
            }}
            register_datasource('{url}', parameters, '{token}', 'overwrite')
            """.format(
            input=self.named_inputs['input data'],
            name=self.name,
            format=self.format,
            storage=self.storage_id,
            description=_('Data source generated by workflow {}').format(
                self.workflow_id),
            user_name=self.user['name'],
            user_id=self.user['id'],
            user_login=self.user['login'],
            workflow_id=self.workflow_id,
            final_url=final_url,
            token=token,
            url=url,
            tags=', '.join(self.tags or []),
            mode=self.mode,
            header=getattr(self, 'header', False),
            data_types=json.dumps(self.SPARK_TO_LIMONERO_DATA_TYPES))
        code += dedent(code_api)
        # No return
        code += '{} = None'.format(self.output)

        return code
Exemplo n.º 6
0
    def generate_code(self):
        """
        Generates the pandas code that writes the input data frame to the
        storage and registers the resulting data source in Limonero.

        The storage information is retrieved from Limonero (using
        self.storage_id) when the code is generated. Trailing/leading
        slashes are removed from storage['url'] and self.path (both are
        updated in place) and the extension matching self.format is appended
        to the final URL when missing. The code itself is rendered from a
        Jinja2 template.

        :returns Generated Python source code as a string
        :raises ValueError if the storage type is not HDFS
        """
        limonero_config = \
            self.parameters['configuration']['juicer']['services']['limonero']
        url = '{}'.format(limonero_config['url'])
        token = str(limonero_config['auth_token'])
        storage = limonero_service.get_storage_info(url, token,
                                                    self.storage_id)

        if storage['type'] != 'HDFS':
            raise ValueError(
                _('Storage type not supported: {}').format(storage['type']))

        if storage['url'].endswith('/'):
            storage['url'] = storage['url'][:-1]
        if self.path.endswith('/'):
            self.path = self.path[:-1]

        if self.path.startswith('/'):
            self.path = self.path[1:]

        final_url = '{}/limonero/user_data/{}/{}/{}'.format(
            storage['url'], self.user['id'], self.path,
            self.name.replace(' ', '_'))

        if self.format == self.FORMAT_CSV and not final_url.endswith('.csv'):
            final_url += '.csv'
        elif self.format == self.FORMAT_JSON and not final_url.endswith(
                '.json'):
            final_url += '.json'
        elif self.format == self.FORMAT_PARQUET and not final_url.endswith(
                '.parquet'):
            final_url += '.parquet'

        df_input = self.named_inputs['input data']
        # Jinja2 template: '{%- ... %}' tags strip the whitespace before
        # them, so the tag lines leave no blank lines in the output.
        code_template = """
            path = '{{path}}'
            {%- if scheme == 'hdfs' %}
            fs = pa.hdfs.connect('{{hdfs_server}}', {{hdfs_port}})
            exists = fs.exists(path)
            {%- elif scheme == 'file' %}
            exists = os.path.exists(path)
            {%- endif %}

            mode = '{{mode}}'
            if mode not in ('error', 'ignore', 'overwrite'):
                raise ValueError('{{error_invalid_mode}}')
            if exists:
                if mode == 'error':
                    raise ValueError('{{error_file_exists}}')
                elif mode == 'ignore':
                    emit_event(name='update task',
                        message='{{warn_ignored}}',
                        status='COMPLETED',
                        identifier='{{task_id}}')
                else:
                    {%- if scheme == 'hdfs'%}
                        fs.delete(path, False)
                    {%- elif scheme == 'file' %}
                        os.remove(path)

            parent_dir = os.path.dirname(path)
            if not os.path.exists(parent_dir):
                os.makedirs(parent_dir)
            {%- endif %}

            {%- if format == FORMAT_CSV %}
            {%- if scheme == 'hdfs' %}
            from io import StringIO
            with fs.open(path, 'wb') as f:
                s = StringIO()
                {{input}}.to_csv(s, sep=str(','), mode='w',
                header={{header}}, index=False, encoding='utf-8')
                f.write(s.getvalue().encode())
            {%- elif scheme == 'file' %}
            {{input}}.to_csv(path, sep=str(','), mode='w',
            header={{header}}, index=False, encoding='utf-8')
            {%- endif %}

            {%- elif format == FORMAT_PARQUET %}
            {%- if scheme == 'hdfs' %}
            from io import BytesIO
            with fs.open(path, 'wb') as f:
                s = BytesIO()
                {{input}}.to_parquet(s, engine='pyarrow')
                f.write(s.getvalue())
            {%- elif scheme == 'file' %}
            {{input}}.to_parquet(path, engine='pyarrow')
            {%- endif %}

            {%- elif format == FORMAT_JSON %}
            {%- if scheme == 'hdfs' %}
            from io import StringIO
            with fs.open(path, 'wb') as f:
                s = StringIO()
                {{input}}.to_json(s, orient='records')
                f.write(s.getvalue().encode())
            {%- elif scheme == 'file' %}
            {{input}}.to_json(path, orient='records')
            {%- endif %}
            {%- endif %}

            # Code to update Limonero metadata information
            from juicer.service.limonero_service import register_datasource
            types_names = {{data_types}}

            write_header = {{header}}
            attributes = []
            for attr in {{input}}.columns:
                type_name = {{input}}.dtypes[attr]
                precision = None
                scale = None
                attributes.append({
                  'enumeration': 0,
                  'feature': 0,
                  'label': 0,
                  'name': attr,
                  'type': types_names[str(type_name)],
                  'nullable': True,
                  'metadata': None,
                  'precision': precision,
                  'scale': scale
                })
            parameters = {
                'name': "{{name}}",
                'is_first_line_header': write_header,
                'enabled': 1,
                'is_public': 0,
                'format': "{{format}}",
                'storage_id': {{storage}},
                'description': "{{description}}",
                'user_id': "{{user_id}}",
                'user_login': "{{user_login}}",
                'user_name': "{{user_name}}",
                'workflow_id': "{{workflow_id}}",
                'task_id': '{{task_id}}',
                'url': "{{final_url}}",
                'attributes': attributes
            }
            register_datasource('{{url}}', parameters, '{{token}}', 'overwrite')
        """
        parsed = urlparse(final_url)
        template = Environment(loader=BaseLoader).from_string(code_template)
        # Only the path component is used by the generated code; server and
        # port are passed separately to the HDFS connection.
        path = parsed.path

        ctx = dict(
            path=path,
            hdfs_server=parsed.hostname,
            hdfs_port=parsed.port,
            scheme=parsed.scheme,
            name=self.name,
            mode=self.mode,
            storage=self.storage_id,
            description=_('Data source generated by workflow {}').format(
                self.workflow_id),
            workflow_id=self.workflow_id,
            format=self.format,
            header=self.header,
            user_name=self.user['name'],
            user_id=self.user['id'],
            user_login=self.user['login'],
            tags=repr(self.tags),
            FORMAT_CSV=self.FORMAT_CSV,
            FORMAT_PICKLE=self.FORMAT_PICKLE,
            FORMAT_JSON=self.FORMAT_JSON,
            FORMAT_PARQUET=self.FORMAT_PARQUET,
            data_types=json.dumps(self.PANDAS_TO_LIMONERO_DATA_TYPES),
            final_url=final_url,
            input=df_input,
            token=token,
            url=url,
            error_file_exists=_('File already exists'),
            warn_ignored=_('File not written (already exists)'),
            error_invalid_mode=_('Invalid mode {}').format(self.mode),
            uuid=uuid.uuid4().hex,
            storage_url=storage['url'],
            task_id=self.parameters['task_id'],
        )

        return dedent(template.render(ctx))
Exemplo n.º 7
0
    def generate_code(self):
        """
        Generates the code that pickles the input model(s) into the storage
        and registers each saved model in Limonero.

        The storage information is retrieved from Limonero (using
        self.storage_id) when the code is generated. A trailing slash is
        removed from storage['url'] (updated in place).

        :returns Generated Python source code as a string
        :raises ValueError if the storage type is not HDFS
        """
        limonero_config = self.parameters.get('configuration') \
            .get('juicer').get('services').get('limonero')

        url = '{}'.format(limonero_config['url'])
        token = str(limonero_config['auth_token'])
        storage = limonero_service.get_storage_info(url, token, self.storage_id)

        if storage['type'] != 'HDFS':
            raise ValueError(_('Storage type not supported: {}').format(
                storage['type']))

        if storage['url'].endswith('/'):
            storage['url'] = storage['url'][:-1]

        parsed = urlparse(storage['url'])
        if parsed.scheme == 'file':
            hostname = 'file:///'
            port = 0
        else:
            hostname = parsed.hostname
            port = parsed.port
        models = self.named_inputs['models']
        if not isinstance(models, list):
            models = [models]

        user = self.parameters.get('user', {})
        # In the template below, doubled braces are literal braces in the
        # generated code; single-brace names are filled in by format().
        # Any criteria other than 'ALL' is rejected in the generated code,
        # otherwise 'models_to_save' would be left undefined.
        code = dedent("""
            from juicer.scikit_learn.model_operation import ModelsEvaluationResultList
            from juicer.service.limonero_service import register_model

            all_models = [{models}]
            criteria = '{criteria}'
            if criteria == 'ALL':
                models_to_save = list(itertools.chain.from_iterable(
                    map(lambda m: m.models if isinstance(m,
                        ModelsEvaluationResultList) else [m], all_models)))
            else:
                raise ValueError('{msg2}')

            import pickle
            from io import BytesIO
            fs = pa.hdfs.connect('{hdfs_server}', {hdfs_port})

            def _save_model(model_to_save, model_path, model_name):
                final_model_path = '{final_url}/{{}}'.format(model_path)
                overwrite = '{overwrite}'
                exists = fs.exists(final_model_path)
                if exists:
                    if overwrite == 'OVERWRITE':
                        fs.delete(final_model_path, False)
                    else:
                        raise ValueError('{error_file_exists}')

                with fs.open(final_model_path, 'wb') as f:
                    b = BytesIO()
                    pickle.dump(model_to_save, b)
                    f.write(b.getvalue())

                # Save model information in Limonero
                model_type = '{{}}.{{}}'.format(model_to_save.__module__,
                    model_to_save.__class__.__name__)

                model_payload = {{
                    "user_id": {user_id},
                    "user_name": '{user_name}',
                    "user_login": '{user_login}',
                    "name": model_name,
                    "class_name": model_type,
                    "storage_id": {storage_id},
                    "path":  model_path,
                    "type": "UNSPECIFIED",
                    "task_id": '{task_id}',
                    "job_id": {job_id},
                    "workflow_id": {workflow_id},
                    "workflow_name": '{workflow_name}'
                }}
                # Save model information in Limonero
                register_model('{url}', model_payload, '{token}')

            for i, model in enumerate(models_to_save):
                if isinstance(model, dict): # For instance, it's a Indexer
                    for k, v in model.items():
                        name = '{name} - {{}}'.format(k)
                        path = '{path}/{name}.{{0}}.{{1:04d}}'.format(k, i)
                        _save_model(v, path, name)
                else:
                    name = '{name}'
                    path = '{path}/{name}.{{0:04d}}'.format(i)
                    _save_model(model, path, name)
        """.format(models=', '.join(models),
                   overwrite=self.write_mode,
                   path=self.path,
                   final_url=storage['url'],
                   url=url,
                   token=token,
                   storage_id=self.storage_id,
                   name=self.filename.replace(' ', '_'),
                   criteria=self.criteria,
                   msg0=_('You cannot mix models with and without '
                          'evaluation (e.g. indexers) when saving models '
                          'and criteria is different from ALL'),
                   msg1=_('You cannot mix models built using with '
                          'different metrics ({}).'),
                   msg2=_('Invalid criteria.'),
                   error_file_exists=_('Model already exists'),
                   job_id=self.job_id,
                   task_id=self.parameters['task_id'],
                   workflow_id=self.workflow_id,
                   workflow_name=self.workflow_name,
                   user_id=user.get('id'),
                   user_name=user.get('name'),
                   user_login=user.get('login'),
                   hdfs_server=hostname,
                   hdfs_port=port,
                   ))
        return code