def upload_distmatrix_to_DS(dist_matrix,
                            feature_type,
                            compound_ids,
                            bucket,
                            title,
                            description,
                            tags,
                            key_values,
                            filepath="./",
                            dataset_key=None):
    """ Uploads distance matrix in the data store with the appropriate tags
    
    Args:
        dist_matrix: dist_matrix computed using
        
        feature_type: How was the data featurized.
        
        dist_met: What distance metric was used.
        
        compound_ids = list of compound ids corresponding to the distance matrix (assumes that distance matrix is square
        and is the distance between all compounds in a dataset)
        
        bucket = bucket the file will be put in
        
        title = title of the file in (human friendly format)
        
        description = long text box to describe file (background/use notes)
        
        tags = must be a list.
        
        key_values = key:value pairs to enable future users to find the file. Must be a dictionary.
        
        filepath = local path where you want to store the pickled dataframe
        
        dataset_key = If updating a file already in the datastore enter the corresponding dataset_key. 
                      If not, leave as 'none' and the dataset_key will be automatically generated.
                      
    Returns:
         None
    """

    # Label the distance matrix rows and columns with the compound ids
    dist_df = pd.DataFrame(dist_matrix)
    dist_df.index = compound_ids
    dist_df.columns = compound_ids
    # Name the pickle file after the feature type, e.g. "distmatrix_ECFP"
    fnm = "distmatrix_nm"
    filename = fnm.replace("nm", feature_type)
    dist_df.to_pickle(filepath + filename)
    dsf.upload_file_to_DS(bucket,
                          title,
                          description,
                          tags,
                          key_values,
                          filepath,
                          filename,
                          dataset_key,
                          client=None)
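
A minimal usage sketch for upload_distmatrix_to_DS, assuming pandas and the project's datastore helper module are already imported as pd and dsf; the bucket name, compound ids, and matrix values below are placeholders, not real data.

# Hypothetical usage (placeholder values; requires datastore access via dsf):
import numpy as np

example_ids = ["CMPD-0001", "CMPD-0002", "CMPD-0003"]
example_matrix = np.array([[0.0, 0.4, 0.7],
                           [0.4, 0.0, 0.2],
                           [0.7, 0.2, 0.0]])
upload_distmatrix_to_DS(example_matrix,
                        feature_type="ECFP",
                        compound_ids=example_ids,
                        bucket="example_bucket",
                        title="Example ECFP distance matrix",
                        description="Pairwise distances for a toy compound set",
                        tags=["distance_matrix"],
                        key_values={"file_category": "distance_matrix"})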
Example #2
def upload_distmatrix_to_DS(dist_matrix,
                            feature_type,
                            compound_ids,
                            bucket,
                            title,
                            description,
                            tags,
                            key_values,
                            filepath="./",
                            dataset_key=None):
    """Uploads a distance matrix to the datastore with the appropriate tags.

    Args:
        dist_matrix (np.ndarray): The distance matrix.
        
        feature_type (str): How the data was featurized.
        
        compound_ids (list): List of compound ids corresponding to the rows/columns of the distance matrix
        (assumes the distance matrix is square and contains the distances between all compounds in a dataset).
        
        bucket (str): Bucket the file will be put in.
        
        title (str): Title of the file (human friendly format).
        
        description (str): Long text field to describe the file (background/use notes).
        
        tags (list): List of tags to assign to the datastore object.
        
        key_values (dict): Dictionary of key:value pairs to include in the datastore object's metadata.
        
        filepath (str): Local path where you want to store the pickled dataframe.
        
        dataset_key (str): If updating a file already in the datastore, enter the corresponding dataset_key.
                      If not, leave as None and the dataset_key will be generated automatically.
                      
    Returns:
         None
    """

    # Label the distance matrix rows and columns with the compound ids
    dist_df = pd.DataFrame(dist_matrix)
    dist_df.index = compound_ids
    dist_df.columns = compound_ids
    fnm = "distmatrix_nm"
    filename = fnm.replace("nm", feature_type)
    dist_df.to_pickle(filepath + filename)
    dsf.upload_file_to_DS(bucket, title, description, tags, key_values, filepath, filename, dataset_key, client=None)
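
For context, a sketch of how a square input matrix of this kind might be built from a per-compound feature array using SciPy; the feature values and metric here are illustrative assumptions, not part of the original code.

import numpy as np
from scipy.spatial.distance import pdist, squareform

# Illustrative feature matrix: one row of descriptors per compound (values are made up).
features = np.random.rand(3, 8)
compound_ids = ["CMPD-0001", "CMPD-0002", "CMPD-0003"]

# pdist returns condensed (upper-triangle) distances; squareform expands them into the
# square, symmetric matrix that upload_distmatrix_to_DS expects.
dist_matrix = squareform(pdist(features, metric="euclidean"))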
def save_model(pipeline, collection_name='model_tracker', log=True):
    """Save the model.

    Save the model files to the datastore and save the model metadata dict to the Mongo database.

    Args:
        pipeline (ModelPipeline object): the pipeline to use
        collection_name (str): the name of the Mongo DB collection to use
        log (bool): True if logs should be printed, default True

    Returns:
        None if insertion was successful; raises UnableToTarException, DatastoreInsertionException,
        MLMTClientInstantiationException or MongoInsertionException otherwise
    """

    if pipeline is None:
        raise Exception('pipeline cannot be None.')

    if not mlmt_supported:
        print(
            "Model tracker not supported in your environment; can save models in filesystem only."
        )
        return

    # ModelPipeline.create_model_metadata() should be called before the call to save_model.
    # Get the metadata dictionary from the model pipeline.
    metadata_dict = pipeline.model_metadata
    model_uuid = metadata_dict['model_uuid']
    if model_uuid is None:
        raise ValueError("model_uuid is missing from pipeline metadata.")

    #### Part 1: Save the model tarball ####
    model = pipeline.model_wrapper
    # best_model_dir is an absolute path.
    directory_to_tar = model.best_model_dir
    # Put tar file in a temporary directory that will automatically be destroyed when we're done
    with tempfile.TemporaryDirectory() as tmp_dir:
        tar_file = os.path.join(
            tmp_dir, 'model_{model_uuid}.tar.gz'.format(model_uuid=model_uuid))
        tar_flags = 'czf'
        # Change directory to model_dir so that paths in tarball are relative to model_dir.
        tar_command = 'tar -{tar_flags} {tar_file} -C {directory_to_tar} .'.format(
            tar_flags=tar_flags,
            tar_file=tar_file,
            directory_to_tar=directory_to_tar)
        try:
            subprocess.check_output(tar_command.split())
        except subprocess.CalledProcessError as e:
            pipeline.log.error(
                'Command to create model tarball returned status {return_code}'
                .format(return_code=e.returncode))
            pipeline.log.error('Command was: "{cmd}"'.format(cmd=e.cmd))
            pipeline.log.error(
                'Output was: "{output}"'.format(output=e.output))
            pipeline.log.error(
                'stderr was: "{stderr}"'.format(stderr=e.stderr))
            raise UnableToTarException(
                'Unable to tar {directory_to_tar}.'.format(
                    directory_to_tar=directory_to_tar))
        title = '{model_uuid} model tarball'.format(model_uuid=model_uuid)
        uploaded_results = dsf.upload_file_to_DS(
            bucket=pipeline.params.model_bucket,
            title=title,
            description=title,
            tags=[],
            key_values={
                'model_uuid': model_uuid,
                'file_category': 'ml_model'
            },
            filepath=tmp_dir,
            filename=tar_file,
            dataset_key='model_' + model_uuid + '_tarball',
            client=pipeline.ds_client,
            return_metadata=True)
        if uploaded_results is None:
            raise DatastoreInsertionException(
                'Unable to upload title={title} to datastore.'.format(
                    title=title))
    # Get the dataset_oid for actual metadata file stored in datastore.
    model_dataset_oid = uploaded_results['dataset_oid']
    # By adding dataset_oid to the dict, we can immediately find the datastore file associated with a model.
    metadata_dict['model_parameters']['model_dataset_oid'] = model_dataset_oid

    #### Part 2: Save the model metadata ####
    mlmt_client = dsf.initialize_model_tracker()
    mlmt_client.save_metadata(collection_name=collection_name,
                              model_uuid=metadata_dict['model_uuid'],
                              model_metadata=metadata_dict)
    if log:
        print('Successfully inserted into the database with model_uuid %s.' %
              model_uuid)
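
Part 1 above builds the tarball by shelling out to the external tar command. A rough standard-library equivalent using Python's tarfile module is sketched below; this is only an illustrative alternative, not what the pipeline actually does, and make_model_tarball is a hypothetical helper name.

import os
import tarfile

def make_model_tarball(directory_to_tar, tar_file):
    # Create a gzipped tarball whose member paths are relative to directory_to_tar,
    # mirroring the effect of `tar -czf {tar_file} -C {directory_to_tar} .`
    with tarfile.open(tar_file, "w:gz") as tar:
        for name in os.listdir(directory_to_tar):
            tar.add(os.path.join(directory_to_tar, name), arcname=name)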
Example #4
def save_model(pipeline, collection_name='model_tracker', log=True):
    """Save the model.

    Save the model files to the datastore and save the model metadata dict to the Mongo database.

    Args:
        pipeline (ModelPipeline object): the pipeline to use

        collection_name (str): the name of the Mongo DB collection to use

        log (bool): True if logs should be printed, default True

    Returns:
        None if insertion was successful; raises DatastoreInsertionException, MLMTClientInstantiationException
        or MongoInsertionException otherwise
    """

    if pipeline is None:
        raise Exception('pipeline cannot be None.')

    if not mlmt_supported:
        logger.error(
            "Model tracker not supported in your environment; can save models in filesystem only."
        )
        return

    # ModelPipeline.create_model_metadata() should be called before the call to save_model.
    # Get the metadata dictionary from the model pipeline.
    metadata_dict = pipeline.model_metadata
    model_uuid = metadata_dict['model_uuid']
    if model_uuid is None:
        raise ValueError("model_uuid is missing from pipeline metadata.")

    #### Part 1: Save the model tarball in the datastore ####
    model = pipeline.model_wrapper
    # Put tar file in a temporary directory that will automatically be destroyed when we're done
    with tempfile.TemporaryDirectory() as tmp_dir:
        tarball_path = os.path.join(tmp_dir, f"model_{model_uuid}.tar.gz")
        save_model_tarball(pipeline.params.output_dir, tarball_path)

        title = f"{model_uuid} model tarball"
        ds_key = f"model_{model_uuid}_tarball"
        uploaded_results = dsf.upload_file_to_DS(
            bucket=pipeline.params.model_bucket,
            title=title,
            description=title,
            tags=[],
            key_values={
                'model_uuid': model_uuid,
                'file_category': 'ml_model'
            },
            filepath=tmp_dir,
            filename=tarball_path,
            dataset_key=ds_key,
            client=pipeline.ds_client,
            return_metadata=True)
        if uploaded_results is None:
            raise DatastoreInsertionException(
                'Unable to upload title={title} to datastore.'.format(
                    title=title))
    # Get the dataset_oid for actual metadata file stored in datastore.
    model_dataset_oid = uploaded_results['dataset_oid']
    # By adding dataset_oid to the dict, we can immediately find the datastore file associated with a model.
    metadata_dict['model_parameters']['model_dataset_oid'] = model_dataset_oid

    #### Part 2: Save the model metadata in the model tracker ####
    mlmt_client = dsf.initialize_model_tracker()
    mlmt_client.save_metadata(collection_name=collection_name,
                              model_uuid=metadata_dict['model_uuid'],
                              model_metadata=metadata_dict)
    if log:
        logger.info(
            'Successfully inserted into the database with model_uuid %s.' %
            model_uuid)
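
A minimal usage sketch for save_model, assuming a ModelPipeline instance has already been trained elsewhere; `trained_pipeline` and the collection name are placeholders, not part of the original code.

# Hypothetical call; `trained_pipeline` stands in for a trained ModelPipeline object.
save_model(trained_pipeline, collection_name='example_model_collection', log=True)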