Example #1
 def test_shock_copy_node(self):
     test_phrase = "Hi there!"
     path_to_temp_file = "/kb/module/work/tmp/temp_copy_" + str(
         time.time()) + ".fq"
     self.textToFile(test_phrase, path_to_temp_file)
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                        token=self.ctx['token'])
     attributes = {'foo': 'bar'}
     shock_id = dfu.file_to_shock({
         'file_path': path_to_temp_file,
         'attributes': attributes
     })['shock_id']
     # Check what's saved
     os.remove(path_to_temp_file)
     node_info = dfu.shock_to_file({
         'shock_id': shock_id,
         'file_path': path_to_temp_file
     })
     self.assertEqual(test_phrase, self.fileToText(path_to_temp_file))
     self.assertEqual(
         node_info.get('attributes'), attributes,
         "Unexpected attributes in node info: " + str(node_info))
     # Let's copy shock node
     shock_id2 = dfu.copy_shock_node({'shock_id': shock_id})['shock_id']
     path_to_temp_file2 = "/kb/module/work/tmp/temp_copy2_" + str(
         time.time()) + ".fq"
     node_info2 = dfu.shock_to_file({
         'shock_id': shock_id2,
         'file_path': path_to_temp_file2
     })
     self.assertEqual(test_phrase, self.fileToText(path_to_temp_file2))
     self.assertEqual(
         node_info2.get('attributes'), attributes,
         "Unexpected attributes in node info: " + str(node_info2))
Example #2
def download_genome_to_json_files(token, genome_ref, target_dir):
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    file_name_to_data_map = {}
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                       token=token,
                       service_ver='dev')
    genome_data = dfu.get_objects({'object_refs': [genome_ref]})['data'][0]
    genome_obj = genome_data['data']
    genome_meta = genome_data['info'][10]
    file_name_to_data_map["genome.json"] = genome_obj
    file_name_to_data_map["genome.meta.json"] = genome_meta
    if 'genbank_handle_ref' in genome_obj:
        gbk_file_name = "genome.gbk"
        dfu.shock_to_file({
            'handle_id': genome_obj['genbank_handle_ref'],
            'file_path': os.path.join(target_dir, gbk_file_name)
        })
        genome_obj['genbank_handle_ref'] = gbk_file_name
    if 'contigset_ref' in genome_obj:
        contigset_data = dfu.get_objects(
            {'object_refs': [genome_obj['contigset_ref']]})['data'][0]
        contigset_obj = contigset_data['data']
        contigset_meta = contigset_data['info'][10]
        file_name_to_data_map["contigset.json"] = contigset_obj
        file_name_to_data_map["contigset.meta.json"] = contigset_meta
        genome_obj['contigset_ref'] = "contigset.json"
    elif 'assembly_ref' in genome_obj:
        assembly_data = dfu.get_objects(
            {'object_refs': [genome_obj['assembly_ref']]})['data'][0]
        assembly_obj = assembly_data['data']
        assembly_meta = assembly_data['info'][10]
        file_name_to_data_map["assembly.json"] = assembly_obj
        file_name_to_data_map["assembly.meta.json"] = assembly_meta
        genome_obj['assembly_ref'] = "assembly.json"
        fasta_handle_ref = assembly_obj['fasta_handle_ref']
        fasta_file_name = "assembly.fa"
        dfu.shock_to_file({
            'handle_id':
            fasta_handle_ref,
            'file_path':
            os.path.join(target_dir, fasta_file_name)
        })
        assembly_obj['fasta_handle_ref'] = fasta_file_name
        assembly_obj['external_source_id'] = fasta_file_name
        if 'taxon_ref' in assembly_obj:
            taxon_obj = dfu.get_objects(
                {'object_refs':
                 [assembly_obj['taxon_ref']]})['data'][0]['data']
            file_name_to_data_map["taxon.json"] = taxon_obj
            assembly_obj['taxon_ref'] = "taxon.json"
            if 'taxon_ref' in genome_obj:
                genome_obj['taxon_ref'] = "taxon.json"
            taxon_obj['parent_taxon_ref'] = ""
    for target_file_name in file_name_to_data_map:
        with open(os.path.join(target_dir, target_file_name), 'w') as f:
            json.dump(file_name_to_data_map[target_file_name],
                      f,
                      sort_keys=True,
                      indent=4)
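
A minimal usage sketch for the helper above (not part of the original example): the token is assumed to come from the SDK runtime, and the genome reference and target directory are placeholders.

# Hypothetical invocation; the reference and paths are illustrative placeholders.
import os

token = os.environ.get('KB_AUTH_TOKEN')        # assumed to be set by the SDK runtime
genome_ref = '12345/6/7'                       # placeholder workspace reference (wsid/objid/ver)
target_dir = '/kb/module/work/tmp/genome_dump'

download_genome_to_json_files(token, genome_ref, target_dir)
print(sorted(os.listdir(target_dir)))          # e.g. genome.json, genome.meta.json, ...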
Example #3
 def test_shock_handle_ws(self):
     test_phrase = "Hi there!"
     path_to_temp_file = "/kb/module/work/tmp/temp_" + str(
         time.time()) + ".fq"
     self.textToFile(test_phrase, path_to_temp_file)
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                        token=self.ctx['token'])
     uploaded = dfu.file_to_shock({
         'file_path': path_to_temp_file,
         'make_handle': 1
     })
     fhandle = uploaded['handle']
     self.assertTrue('hid' in fhandle, "Handle: " + str(fhandle))
     data = {'hid': fhandle['hid']}
     obj_name = 'TestObject.1'
     info = self.getWsClient().save_objects({
         'workspace':
         self.getWsName(),
         'objects': [{
             'type': 'Empty.AHandle',
             'data': data,
             'name': obj_name
         }]
     })[0]
     self.assertEqual(info[1], obj_name)
     ref = self.getWsName() + '/' + obj_name
     handle_data = self.getWsClient().get_objects([{'ref': ref}])[0]['data']
     self.assertTrue('hid' in handle_data, "Data: " + str(handle_data))
     hid = handle_data['hid']
     path_to_temp_file2 = "/kb/module/work/tmp/temp2_" + str(
         time.time()) + ".fq"
     dfu.shock_to_file({'handle_id': hid, 'file_path': path_to_temp_file2})
     self.assertEqual(test_phrase, self.fileToText(path_to_temp_file2))
Example #4
    def _get_cached_index(self, assembly_info, validated_params):

        try:
            # note: list_referencing_objects does not yet support reference paths, so we need to call
            # it with the direct reference.  So we won't get a cache hit if you don't have direct access
            # to the assembly object right now (although you can still always build the assembly object)
            # Once this call supports paths, this should be changed to set ref = assembly_info['ref']
            info = assembly_info['info']
            ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
            objs = self.ws.list_referencing_objects([{'ref': ref}])[0]

            # iterate through each of the objects that reference the assembly
            bowtie2_indexes = []
            for o in objs:
                if o[2].startswith('KBaseRNASeq.Bowtie2IndexV2'):
                    bowtie2_indexes.append(o)

            # Nothing refs this assembly, so cache miss
            if len(bowtie2_indexes) == 0:
                return False

            # if there is more than one hit, get the most recent one
            # (obj_info[3] is the save_date timestamp (eg 2017-05-30T22:56:49+0000), so we can sort on that)
            bowtie2_indexes.sort(key=lambda x: x[3])
            bowtie2_index_info = bowtie2_indexes[-1]
            index_ref = str(bowtie2_index_info[6]) + '/' + str(bowtie2_index_info[0]) + '/' + str(bowtie2_index_info[4])

            # get the object data
            index_obj_data = self.ws.get_objects2({'objects': [{'ref': index_ref}]})['data'][0]['data']

            # download the handle object
            os.makedirs(validated_params['output_dir'])

            dfu = DataFileUtil(self.callback_url)
            dfu.shock_to_file({'file_path': os.path.join(validated_params['output_dir'], 'bt2_index.tar.gz'),
                               'handle_id': index_obj_data['handle']['hid'],
                               'unpack': 'unpack'})
            print('Cache hit: ')
            pprint(index_obj_data)
            return {'output_dir': validated_params['output_dir'],
                    'index_files_basename': index_obj_data['index_files_basename']}


        except Exception:
            # if the cache lookup fails for any reason, don't worry
            print('WARNING: exception encountered when trying to lookup in cache:')
            print(traceback.format_exc())
            print('END WARNING: exception encountered when trying to lookup in cache.')


        return None
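
The ref built above follows the standard KBase object_info tuple layout (index 0 = object id, 4 = version, 6 = workspace id). A small illustration with a made-up info tuple:

# Illustration only: a made-up object_info tuple in the usual KBase layout
# [objid, name, type, save_date, version, saved_by, wsid, workspace, chsum, size, meta]
info = [12, 'my_assembly', 'KBaseGenomeAnnotations.Assembly-5.0',
        '2017-05-30T22:56:49+0000', 3, 'someuser', 4567, 'my_workspace', 'notarealmd5', 1024, {}]
ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
assert ref == '4567/12/3'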
Example #5
    def get_genbank_handle(self, data):
        if 'genbank_handle_ref' not in data:
            return None
        if data['genbank_handle_ref'] is None:
            return None

        print('pulling cached genbank file from Shock: ' +
              str(data['genbank_handle_ref']))
        dfu = DataFileUtil(self.cfg.callbackURL)
        file = dfu.shock_to_file({
            'handle_id': data['genbank_handle_ref'],
            'file_path': self.cfg.sharedFolder,
            'unpack': 'unpack'
        })
        return {'genbank_file': {'file_path': file['file_path']}}
Example #6
    def get_gff_handle(self, data, output_dir):

        if 'gff_handle_ref' not in data:
            return None
        if data['gff_handle_ref'] is None:
            return None

        print('pulling cached GFF file from Shock: ' +
              str(data['gff_handle_ref']))
        dfu = DataFileUtil(self.cfg.callbackURL)
        file_ret = dfu.shock_to_file({
            'handle_id': data['gff_handle_ref'],
            'file_path': output_dir,
            'unpack': 'unpack'
        })
        return {'file_path': file_ret['file_path']}
Example #7
def download_file_from_shock(logger,
                             shock_service_url=None,
                             shock_id=None,
                             filename=None,
                             directory=None,
                             filesize=None,
                             token=None):
    """
    Given a SHOCK instance URL and a SHOCK node id, download the contents of that node
    to a file on disk.
    """

    if filename is None:
        # without a filename there is nothing sensible to write to
        raise ValueError('filename is required')
    shockFileName = filename

    if directory is not None:
        filePath = os.path.join(directory, shockFileName)
    else:
        filePath = shockFileName

    # shock_service_url is from config
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token)
    return dfu.shock_to_file({"shock_id": shock_id, "file_path": filePath, "unpack": None})
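
A possible call site for the function above; the node id, filename, and directory are placeholders rather than real values.

# Hypothetical usage; shock_id, filename and directory are made up.
import os

result = download_file_from_shock(logger=None,
                                  shock_id='made-up-shock-node-id',
                                  filename='reads.fq',
                                  directory='/kb/module/work/tmp',
                                  token=os.environ.get('KB_AUTH_TOKEN'))
print(result['file_path'])  # where DataFileUtil wrote the downloaded file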
Example #8
    def stage_input(self, params):
        ''' Setup the input_directory by fetching the files and uncompressing if needed. '''

        # construct the input directory where we stage files
        input_directory = os.path.join(self.cfg.sharedFolder, 'genome-upload-staging-' + str(uuid.uuid4()))
        os.makedirs(input_directory)

        # at this point, the 'file' input is validated, so we don't have to catch any special cases
        # we expect one and only one of path, shock_id, or ftp_url

        # determine how to get the file: if it is from shock, download it.  If it
        # is just sitting there, then use it.  Move the file to the staging input directory
        file = params['file']
        genbank_file_path = None
        if 'path' in file and file['path'] is not None:
            # copy the local file to the input staging directory
            # (NOTE: could just move it, but then this method would have the side effect of moving your
            # file which another SDK module might have an open handle on)
            local_file_path = file['path']
            genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        if 'shock_id' in file and file['shock_id'] is not None:
            # handle shock file
            print('Downloading file from SHOCK node: ' + str(self.cfg.shockURL) + ' - ' + str(file['shock_id']))
            sys.stdout.flush()
            dfUtil = DataFileUtil(self.cfg.callbackURL)
            file_name = dfUtil.shock_to_file({
                                    'file_path': input_directory,
                                    'shock_id': file['shock_id']
                                })['node_file_name']
            genbank_file_path = os.path.join(input_directory, file_name)

        if 'ftp_url' in file and file['ftp_url'] is not None:
            # Note that the Transform originally had a script_utils.download_from_urls method
            # that, if the url is a folder, pulls all subfiles.  That code recently broke when
            # fetching from NCBI (not clear if it is our issue or NCBI), but for now just
            # support the most common case- an FTP to a single file.
            print('Downloading file from: ' + str(file['ftp_url']))
            sys.stdout.flush()

            url = urlparse(file['ftp_url'])
            if url.scheme != 'ftp' and url.scheme != 'http':
                raise ValueError('Only FTP/HTTP servers are supported')
            file_name = 'genome.gbk'
            if url.path != '':
                file_name = url.path.split('/')[-1]

            req = urllib2.Request(file['ftp_url'])
            response = urllib2.urlopen(req)
            file_data = response.read()

            genbank_file_path = os.path.join(input_directory, file_name)
            with open(genbank_file_path, "w") as genbank_file:
                genbank_file.write(file_data)

        # extract the file if it is compressed
        if genbank_file_path is not None:
            print("staged input file =" + genbank_file_path)
            sys.stdout.flush()
            dfUtil = DataFileUtil(self.cfg.callbackURL)
            dfUtil.unpack_file({ 'file_path': genbank_file_path })

        else:
            raise ValueError('No valid files could be extracted based on the input')

        return input_directory
Example #9
    def genbank_to_genome_annotation(self, ctx, params):
        """
        :param params: instance of type "GenbankToGenomeAnnotationParams"
           (file_path or shock_id -- Local path or shock_id of the uploaded
           file with genome sequence in GenBank format or zip-file with
           GenBank files. genome_name -- The name you would like to use to
           reference this GenomeAnnotation. If not supplied, will use the
           Taxon Id and the data source to determine the name. taxon_wsname -
           name of the workspace containing the Taxonomy data, defaults to
           'ReferenceTaxons') -> structure: parameter "file_path" of String,
           parameter "shock_id" of String, parameter "ftp_url" of String,
           parameter "genome_name" of String, parameter "workspace_name" of
           String, parameter "source" of String, parameter "taxon_wsname" of
           String, parameter "convert_to_legacy" of type "boolean" (A boolean
           - 0 for false, 1 for true. @range (0, 1))
        :returns: instance of type "GenomeAnnotationDetails" -> structure:
           parameter "genome_annotation_ref" of String
        """
        # ctx is the context object
        # return variables are: details
        #BEGIN genbank_to_genome_annotation

        print('genbank_to_genome_annotation -- parameters = ')
        pprint(params)

        # validate input and set defaults.  Note that because we don't call the uploader method
        # as a stand alone script, we do the validation here.
        if 'workspace_name' not in params:
            raise ValueError('workspace_name field was not defined')
        workspace_name = params['workspace_name']

        if 'genome_name' not in params:
            raise ValueError('genome_name field was not defined')
        genome_name = params['genome_name']

        source = 'Genbank'
        if 'source' in params:
            source = params['source']

        taxon_wsname = 'ReferenceTaxons'
        if 'taxon_wsname' in params:
            taxon_wsname = params['taxon_wsname']

        # other options to handle
        # release
        # taxon_reference
        # exclude_feature_types
        # type


        # construct the input directory where we stage files
        input_directory = os.path.join(self.sharedFolder, 'genome-upload-staging-' + str(uuid.uuid4()))
        os.makedirs(input_directory)

        # determine how to get the file: if it is from shock, download it.  If it
        # is just sitting there, then use it.  Move the file to the staging input directory

        genbank_file_path = None

        if 'file_path' not in params:
            if 'shock_id' not in params:
                if 'ftp_url' not in params:
                    raise ValueError('No input file (either file_path, shock_id, or ftp_url) provided')
                else:
                    # TODO handle ftp - this creates a directory for us, so update the input directory
                    print('calling Transform download utility: script_utils.download')
                    print('URL provided = ' + params['ftp_url'])
                    script_utils.download_from_urls(
                        working_directory=input_directory,
                        token=ctx['token'],  # not sure why this requires a token to download from a url...
                        urls={'ftpfiles': params['ftp_url']}
                    )
                    input_directory = os.path.join(input_directory,'ftpfiles')
                    # unpack everything in input directory
                    dir_contents = os.listdir(input_directory)
                    print('downloaded directory listing:')
                    pprint(dir_contents)
                    dir_files = []
                    for f in dir_contents:
                        if os.path.isfile(os.path.join(input_directory, f)):
                            dir_files.append(f)

                    print('processing files in directory...')
                    for f in dir_files:
                        # unpack if needed using the standard transform utility
                        print('unpacking '+f)
                        script_utils.extract_data(filePath=os.path.join(input_directory,f))

            else:
                # handle shock file
                dfUtil = DataFileUtil(self.callback_url, token=ctx['token'])
                file_name = dfUtil.shock_to_file({
                                    'file_path': input_directory,
                                    'shock_id': params['shock_id']
                                })['node_file_name']
                genbank_file_path = os.path.join(input_directory, file_name)
        else:
            # copy the local file to the input staging directory
            # (NOTE: could just move it, but then this method would have the side effect of moving your
            # file which another SDK module might have an open handle on)
            local_file_path = params['file_path']
            genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        if genbank_file_path is not None:
            print("input genbank file =" + genbank_file_path)

            # unpack if needed using the standard transform utility
            script_utils.extract_data(filePath=genbank_file_path)

        # do the upload (doesn't seem to return any information)
        uploader.upload_genome(
                logger=None,

                shock_service_url = self.shockURL,
                handle_service_url = self.handleURL,
                workspace_service_url = self.workspaceURL,

                input_directory=input_directory,

                workspace_name   = workspace_name,
                core_genome_name = genome_name,
                source           = source,
                taxon_wsname     = taxon_wsname
            )

        #### Code to convert to legacy type if requested
        if 'convert_to_legacy' in params and params['convert_to_legacy']==1:
            from doekbase.data_api.converters import genome as cvt
            print('Converting to legacy type, object={}'.format(genome_name))
            cvt.convert_genome(
                    shock_url=self.shockURL,
                    handle_url=self.handleURL,
                    ws_url=self.workspaceURL,
                    obj_name=genome_name,
                    ws_name=workspace_name)

        # clear the temp directory
        shutil.rmtree(input_directory)

        # get WS metadata to return the reference to the object (could be returned by the uploader method...)
        ws = Workspace(url=self.workspaceURL)
        info = ws.get_object_info_new({'objects':[{'ref':workspace_name + '/' + genome_name}],'includeMetadata':0, 'ignoreErrors':0})[0]

        details = {
            'genome_annotation_ref':str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        }


        #END genbank_to_genome_annotation

        # At some point might do deeper type checking...
        if not isinstance(details, dict):
            raise ValueError('Method genbank_to_genome_annotation return value ' +
                             'details is not type dict as required.')
        # return the results
        return [details]
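
Going by the docstring above, a caller might assemble a params dictionary along these lines; every value is a placeholder, and impl/ctx stand in for a server instance and call context.

# Illustrative parameters only; workspace, genome and node names are placeholders.
params = {
    'workspace_name': 'my_workspace',
    'genome_name': 'my_genome',
    'shock_id': 'made-up-shock-node-id',   # alternatively 'file_path' or 'ftp_url'
    'source': 'Genbank',
    'taxon_wsname': 'ReferenceTaxons',
    'convert_to_legacy': 0,
}
# details = impl.genbank_to_genome_annotation(ctx, params)[0]
# print(details['genome_annotation_ref'])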
Example #10
class AssemblyToFasta:
    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    def export_as_fasta(self, ctx, params):
        ''' Used almost exclusively for download only '''
        # validate parameters
        if 'input_ref' not in params:
            raise ValueError(
                'Cannot export Assembly - no input_ref field defined.')

        # export to a file
        file = self.assembly_as_fasta(ctx, {'ref': params['input_ref']})

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.scratch, file['assembly_name'])
        os.makedirs(export_package_dir)
        shutil.move(
            file['path'],
            os.path.join(export_package_dir, os.path.basename(file['path'])))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        return {'shock_id': package_details['shock_id']}

    def assembly_as_fasta(self, ctx, params):
        ''' main function that accepts a ref to an object and writes a file '''

        self.validate_params(params)

        print('downloading ws object data (' + params['ref'] + ')')
        assembly_object = self.dfu.get_objects(
            {'object_refs': [params['ref']]})['data'][0]
        ws_type = assembly_object['info'][2]
        obj_name = assembly_object['info'][1]

        if 'filename' in params:
            output_filename = params['filename']
        else:
            output_filename = obj_name + '.fa'

        output_fasta_file_path = os.path.join(self.scratch, output_filename)

        if 'KBaseGenomes.ContigSet' in ws_type:
            self.process_legacy_contigset(output_fasta_file_path,
                                          assembly_object['data'])
        elif 'KBaseGenomeAnnotations.Assembly' in ws_type:
            self.process_assembly(output_fasta_file_path,
                                  assembly_object['data'])

        else:
            raise ValueError(
                'Cannot write data to fasta; invalid WS type (' + ws_type +
                ').  Supported types are KBaseGenomes.ContigSet and ' +
                'KBaseGenomeAnnotations.Assembly')

        return {'path': output_fasta_file_path, 'assembly_name': obj_name}

    def fasta_rows_generator_from_contigset(self, contig_list):
        ''' generates SeqRecords iterator for writing from a legacy contigset object '''
        for contig in contig_list:
            description = ''
            if 'description' in contig and contig['description']:
                description = contig['description']
            yield SeqRecord(Seq(contig['sequence'], SingleLetterAlphabet),
                            id=contig['id'],
                            description=description)

    def process_legacy_contigset(self, output_fasta_path, data):
        ''' write the contigs of a legacy ContigSet object out as FASTA '''
        SeqIO.write(self.fasta_rows_generator_from_contigset(data['contigs']),
                    output_fasta_path, "fasta")

    def process_assembly(self, output_fasta_path, data):
        ''' download the assembly FASTA file from Shock via its handle '''
        self.dfu.shock_to_file({
            'handle_id': data['fasta_handle_ref'],
            'file_path': output_fasta_path,
            'unpack': 'uncompress'
        })

    def validate_params(self, params):
        for key in ['ref']:
            if key not in params:
                raise ValueError('required "' + key +
                                 '" field was not defined')
Example #11
class RNASeqDownloaderUtils:
    def __init__(self, config):
        log('--->\nInitializing RNASeqDownloaderUtils instance:\n config: %s' %
            config)
        self.scratch = config['scratch']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url, token=self.token)
        self.rau = ReadsAlignmentUtils(self.callback_url, token=self.token)

    def download_RNASeq(self, params):
        """
        download_RNASeq: download RNASeq Alignment/Expression/DifferentialExpression zip file

        params:
        input_ref: RNASeq object reference ID
        rna_seq_type: one of ['RNASeqAlignment', 
                              'RNASeqExpression', 
                              'RNASeqDifferentialExpression']

        return:
        shock_id: Shock ID of stored zip file
    
        """
        log('--->\nrunning RNASeqDownloaderUtils.download_RNASeq:\nparams: %s'
            % params)

        # Validate params
        self.validate_download_rna_seq_alignment_parameters(params)

        # Download RNASeq zip file
        # RNASeq Alignment, Expression and DifferentialExpression
        # share the same object_data/handle_data structure
        returnVal = self._download_rna_seq_zip(params.get('input_ref'))

        return returnVal

    def download_RNASeq_Alignment(self, params):
        """
        download_RNASeq_Alignment: download RNASeq Alignment zip file

        params:
        input_ref: RNASeq object reference ID
        rna_seq_type: 'RNASeqAlignment'
        download_file_type: one of 'bam', 'sam' or 'bai'

        return:
        shock_id: Shock ID of stored zip file
    
        """
        log('--->\nrunning RNASeqDownloaderUtils.download_RNASeq_Alignment:\nparams: %s'
            % params)

        # Validate params
        self.validate_download_rna_seq_alignment_parameters(params)

        input_ref = params.get('input_ref')
        returnVal = dict()

        download_file_type = params.get('download_file_type')
        if download_file_type == 'bam':
            destination_dir = self.rau.download_alignment({
                'source_ref': input_ref,
                'downloadBAI': True
            })['destination_dir']
            shock_id = self._upload_dir_to_shock(destination_dir)
        elif download_file_type == 'sam':
            destination_dir = self.rau.download_alignment({
                'source_ref': input_ref,
                'downloadSAM': True,
                'downloadBAI': True
            })['destination_dir']
            files = os.listdir(destination_dir)
            bam_files = [x for x in files if re.match(r'.*\.bam', x)]
            for bam_file in bam_files:
                log('removing file: {}'.format(bam_file))
                os.remove(os.path.join(destination_dir, bam_file))
            shock_id = self._upload_dir_to_shock(destination_dir)
        else:
            # guard against unexpected types; otherwise shock_id would be unbound below
            raise ValueError('Unexpected download_file_type: {}'.format(download_file_type))

        returnVal['shock_id'] = shock_id

        return returnVal

    def validate_download_rna_seq_alignment_parameters(self, params):
        """
        validate_download_rna_seq_alignment_parameters: 
                        validates params passed to download_rna_seq_alignment method
    
        """

        # check required parameters
        for p in ['input_ref', 'rna_seq_type']:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

        # check supported RNASeq types
        valid_rnaseq_types = [
            'RNASeqAlignment', 'RNASeqExpression',
            'RNASeqDifferentialExpression'
        ]
        if params['rna_seq_type'] not in valid_rnaseq_types:
            raise ValueError('Unexpected RNASeq type: %s' %
                             params['rna_seq_type'])

    def _download_rna_seq_zip(self, input_ref):
        """
        _download_rna_seq_zip: download RNASeq's archive zip file

        returns:
        shock_id: Shock ID of stored zip file

        """

        # get object data
        object_data = self._get_object_data(input_ref)
        log('---> getting object data\n object_data: %s' %
            json.dumps(object_data, indent=1))

        # get handle data
        handle = self._get_handle_data(object_data)
        log('---> getting handle data\n handle_data: %s' %
            json.dumps(handle, indent=1))

        # make tmp directory for downloading
        dstdir = os.path.join(self.scratch, 'tmp')
        if not os.path.exists(dstdir):
            os.makedirs(dstdir)

        # download original zip file and save to tmp directory
        handle_id = handle.get('hid')
        original_zip_file_path = self._download_original_zip_file(
            handle_id, dstdir)

        log('---> loading %s to shock' % original_zip_file_path)
        shock_id = self._upload_to_shock(original_zip_file_path)

        log('---> removing folder: %s' % dstdir)
        shutil.rmtree(dstdir)

        returnVal = {"shock_id": shock_id}

        return returnVal

    def _get_object_data(self, input_ref):
        """
        _get_object_data: get object_data using DataFileUtil

        """

        get_objects_params = {
            'object_refs': [input_ref],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)

        return object_data

    def _get_handle_data(self, object_data):
        """
        _get_handle_data: get Handle from object_data

        """

        try:
            handle = object_data.get('data')[0].get('data').get('file')
        except Exception:
            error_msg = "Unexpected object format. Refer to DataFileUtil.get_objects definition\n"
            error_msg += "object_data:\n%s" % json.dumps(object_data, indent=1)
            raise ValueError(error_msg)

        if handle is None:
            error_msg = "object_data does NOT have Handle(file key)\n"
            error_msg += "object_data:\n%s" % json.dumps(object_data, indent=1)
            raise ValueError(error_msg)
        elif handle.get('hid') is None:
            error_msg = "Handle does have NOT HandleId(hid key)\n"
            error_msg += "handle_data:\n%s" % json.dumps(handle, indent=1)
            raise ValueError(error_msg)
        else:
            return handle

    def _download_original_zip_file(self, handle_id, dstdir):
        """
        _download_original_zip_file: download original archive .zip file using DataFileUtil
        
        """

        shock_to_file_params = {'handle_id': handle_id, 'file_path': dstdir}
        original_zip_file = self.dfu.shock_to_file(shock_to_file_params)

        original_zip_file_path = original_zip_file.get('file_path')

        return original_zip_file_path

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
    
        """

        file_to_shock_params = {'file_path': file_path}
        shock_file = self.dfu.file_to_shock(file_to_shock_params)

        shock_id = shock_file.get('shock_id')

        return shock_id

    def _upload_dir_to_shock(self, directory):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
    
        """

        file_to_shock_params = {'file_path': directory, 'pack': 'zip'}
        shock_file = self.dfu.file_to_shock(file_to_shock_params)

        shock_id = shock_file.get('shock_id')

        return shock_id
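
A minimal sketch of calling the downloader above, assuming the usual SDK config keys; the object reference is a placeholder.

# Hypothetical usage of RNASeqDownloaderUtils; config values and the reference are placeholders.
import os

config = {'scratch': '/kb/module/work/tmp',
          'SDK_CALLBACK_URL': os.environ.get('SDK_CALLBACK_URL'),
          'KB_AUTH_TOKEN': os.environ.get('KB_AUTH_TOKEN')}
utils = RNASeqDownloaderUtils(config)
result = utils.download_RNASeq({'input_ref': '12345/6/7',
                                'rna_seq_type': 'RNASeqAlignment'})
print(result['shock_id'])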
Example #12
class DifferentialExpressionUtils:
    '''
    Module Name:
    DifferentialExpressionUtils

    Module Description:
    A KBase module: DifferentialExpressionUtils
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/DifferentialExpressionUtils.git"
    GIT_COMMIT_HASH = "76ae39b906473558b32b54acd66385e7474b0115"

    #BEGIN_CLASS_HEADER

    PARAM_IN_SRC_DIR = 'source_dir'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_TOOL_USED = 'tool_used'
    PARAM_IN_TOOL_VER = 'tool_version'
    PARAM_IN_EXPR_SET_REF = 'expressionset_ref'
    PARAM_IN_GENOME_REF = 'genome_ref'
    PARAM_IN_DIFFEXP_FILEPATH = 'diffexpr_filepath'

    def log(self, message, prefix_newline=False):
        print(('\n' if prefix_newline else '') + str(time.time()) + ': ' +
              message)

    def _check_required_param(self, in_params, param_list):
        """
        Check if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Check the validity of workspace and object params and return them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name.strip()) or ws_name == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        dfu = DataFileUtil(self.callback_url)

        if not isinstance(ws_name, int):

            try:
                ws_id = dfu.ws_name_to_id(ws_name)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.log('Obtained workspace name/id ' + str(ws_id))

        return ws_name, ws_id, obj_name_id

    def _proc_upload_diffexpr_params(self, ctx, params):
        """
        Check the presence and validity of upload expression params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_GENOME_REF,
            self.PARAM_IN_TOOL_USED, self.PARAM_IN_TOOL_VER,
            self.PARAM_IN_DIFFEXP_FILEPATH
        ])

        ws_name, ws_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        diffexpr_filepath = params.get(self.PARAM_IN_DIFFEXP_FILEPATH)

        if not (os.path.isfile(diffexpr_filepath)):
            raise ValueError(
                'File {} does not exist'.format(diffexpr_filepath))

        return ws_name, ws_id, obj_name_id

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.log('Logging workspace exception')
            self.log(str(wse))
            raise
        return info

    def _get_diffexpr_data(self, expressionset_ref):
        """
        Get data from expressionset object required to create
        differential expression object
        """
        expression_set = self.ws_client.get_objects2(
            {'objects': [{
                'ref': expressionset_ref
            }]})['data'][0]

        if not expression_set.get('info')[2].startswith(
                'KBaseRNASeq.RNASeqExpressionSet'):
            raise TypeError(
                '"{}" should be of type KBaseRNASeq.RNASeqExpressionSet'.
                format(self.PARAM_IN_EXPR_SET_REF))

        expression_set_data = expression_set['data']

        diffexpr_data = {}
        diffexpr_data['expressionSet_id'] = expressionset_ref
        diffexpr_data['alignmentSet_id'] = expression_set_data.get(
            'alignmentSet_id')
        diffexpr_data['sampleset_id'] = expression_set_data.get('sampleset_id')
        diffexpr_data['genome_id'] = expression_set_data.get('genome_id')

        condition = []

        mapped_expr_ids = expression_set_data.get('mapped_expression_ids')

        for i in mapped_expr_ids:
            for alignment_id, expression_id in i.items():
                expression_data = self.ws_client.get_objects2(
                    {'objects': [{
                        'ref': expression_id
                    }]})['data'][0]['data']
                expression_condition = expression_data.get('condition')
                if expression_condition not in condition:
                    condition.append(expression_condition)

        diffexpr_data.update({'condition': condition})
        return diffexpr_data

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.demu = GenDiffExprMatrix(config)
        #END_CONSTRUCTOR
        pass

    def upload_differentialExpression(self, ctx, params):
        """
        Uploads the differential expression  *
        :param params: instance of type "UploadDifferentialExpressionParams"
           (*    Required input parameters for uploading Differential
           expression data string   destination_ref        -   object
           reference of Differential expression data. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           string   source_dir             -   directory with the files to be
           uploaded string   expressionset_ref      -   expressionset object
           reference string   tool_used              -   cufflinks, ballgown
           or deseq string   tool_version           -   version of the tool
           used string   diffexpr_filename      -   name of the differential
           expression data file in source_dir, created by cuffdiff, deseq or
           ballgown *) -> structure: parameter "destination_ref" of String,
           parameter "source_dir" of String, parameter "expressionset_ref" of
           String, parameter "tool_used" of String, parameter "tool_version"
           of String, parameter "diffexpr_filename" of String, parameter
           "tool_opts" of mapping from String to String, parameter "comments"
           of String
        :returns: instance of type "UploadDifferentialExpressionOutput" (*   
           Output from upload differential expression    *) -> structure:
           parameter "obj_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_differentialExpression

        self.log(
            'Starting upload differential expression, parsing parameters ')
        pprint(params)

        ws_name, ws_id, obj_name_id = self._proc_upload_diffexpr_params(
            ctx, params)

        # add more to params to pass on to create diff expr matrix

        params['ws_name'] = ws_name
        params['ws_id'] = ws_id
        params['obj_name'] = obj_name_id

        demset_ref = self.demu.gen_diffexpr_matrices(params)

        self.log('Differential Expression Matrix set ref: ')
        pprint(demset_ref)

        returnVal = {'diffExprMatrixSet_ref': demset_ref}

        print('Uploaded object: ')
        print(returnVal)
        #END upload_differentialExpression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method upload_differentialExpression return value ' +
                'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_differentialExpression(self, ctx, params):
        """
        Downloads expression *
        :param params: instance of type
           "DownloadDifferentialExpressionParams" (* Required input
           parameters for downloading Differential expression string 
           source_ref   -      object reference of expression source. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id *) -> structure: parameter "source_ref" of String
        :returns: instance of type "DownloadDifferentialExpressionOutput" (* 
           The output of the download method.  *) -> structure: parameter
           "destination_dir" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_differentialExpression

        self.log('Running download_differentialExpression with params:\n' +
                 pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.log('Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'download_' + str(timestamp))
        os.mkdir(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id':
            expression[0]['data']['file']['id'],
            'file_path':
            output_dir,
            'unpack':
            'unpack'
        })

        if not os.listdir(output_dir):
            raise ValueError('No files were downloaded: ' + output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        returnVal = {'destination_dir': output_dir}

        #END download_differentialExpression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method download_differentialExpression return value ' +
                'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_differentialExpression(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download expressions from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting expression string   source_ref         - 
           object reference of Differential expression. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_differentialExpression

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required')

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.log('Logging stacktrace from workspace exception:\n' + e.data)
            raise

        output = {'shock_id': expression[0]['data']['file']['id']}

        #END export_differentialExpression

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method export_differentialExpression return value ' +
                'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
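
Judging from _proc_upload_diffexpr_params above, a minimal parameter dictionary for upload_differentialExpression might look like this; all values are placeholders, and impl/ctx stand in for a server instance and call context.

# Illustrative parameters; paths, references and names are made up.
params = {
    'destination_ref': 'my_workspace/my_diff_expression',
    'genome_ref': '12345/6/7',
    'tool_used': 'cufflinks',
    'tool_version': '2.2.1',
    'diffexpr_filepath': '/kb/module/work/tmp/gene_exp.diff',
}
# returnVal = impl.upload_differentialExpression(ctx, params)[0]
# print(returnVal['diffExprMatrixSet_ref'])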
Example #13
class FastaToAssembly:

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"


    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print('filtering fasta file by contig length (min len=' + str(min_contig_length) + 'bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print('parsing FASTA file: ' + str(fasta_file_path))
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(' - parsed ' + str(assembly_data['num_contigs']) + ' contigs, ' +
              str(assembly_data['dna_size']) + 'bp')

        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info


    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        ''' construct the WS object data to save based on the parsed info and params '''
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            assembly_data['taxon_ref'] = params['taxon_ref']

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return assembly_data


    def parse_fasta(self, fasta_file_path, params):
        ''' Do the actual work of inspecting each contig '''

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This fasta file may have amino acids in it instead ' +
                                         'of the required nucleotides.')
                    raise ValueError("This FASTA file has non nucleic acid characters : {0}".format(character))

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
                contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The fasta header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')
            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list))).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data


    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        ''' generates a SeqRecord iterator containing only contigs of at least min_contig_length '''
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(' - filtered out ' + str(rows - rows_added) + ' of ' + str(rows) + ' contigs that were shorter than ' +
              str(min_contig_length) + 'bp.')


    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        ''' removes all contigs less than the min_contig_length provided '''
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path


    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info


    def save_fasta_file_to_shock(self, fasta_file_path):
        ''' Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;

        '''
        print('Uploading fasta file (' + str(fasta_file_path) + ') to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})


    def stage_input(self, params):
        ''' Setup the input_directory by fetching the files and returning the path to the file'''
        file_path = None
        if 'file' in params:
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print('Downloading file from SHOCK node: ' + str(params['shock_id']))
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print('Downloading file from: ' + str(params['ftp_url']))
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid fasta could be extracted based on the input parameters')


    def validate_params(self, params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or ftp_url is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a fasta file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required fasta file as input, set as either "file", "shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('required exactly one fasta file as input source, you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')
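
Based on validate_params and stage_input above, importing a local FASTA file might look like this; paths, workspace and object names are placeholders.

# Hypothetical import of a local FASTA file; values are illustrative only.
import os

fta = FastaToAssembly(os.environ.get('SDK_CALLBACK_URL'), '/kb/module/work/tmp')
assembly_info = fta.import_fasta(None, {          # ctx is not used by import_fasta
    'workspace_name': 'my_workspace',
    'assembly_name': 'my_assembly',
    'file': {'path': '/kb/module/work/tmp/contigs.fa'},
    'min_contig_length': 500,
})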
Example #14
class CufflinksUtils:
    CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/'
    GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/'

    def __init__(self, config):
        """

        :param config:
        :param logger:
        :param directory: Working directory
        :param urls: Service urls
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR

    def parse_FPKMtracking_calc_TPM(self, filename):
        """
        Generates TPM from FPKM
        :return:
        """
        fpkm_dict = {}
        tpm_dict = {}
        gene_col = 0
        fpkm_col = 9
        sum_fpkm = 0.0
        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                gene_id = larr[gene_col]
                if gene_id != "":
                    fpkm = float(larr[fpkm_col])
                    sum_fpkm = sum_fpkm + fpkm
                    fpkm_dict[gene_id] = math.log(fpkm + 1, 2)
                    tpm_dict[gene_id] = fpkm

        if sum_fpkm == 0.0:
            log("Warning: Unable to calculate TPM values as sum of FPKM values is 0"
                )
        else:
            for g in tpm_dict:
                tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2)

        return fpkm_dict, tpm_dict
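    # Worked example for the conversion above (illustrative comment; the gene
    # ids and FPKM values are made up): for two genes with FPKM 10 and 30,
    # sum_fpkm is 40, so
    #     TPM(g1) = 10 / 40 * 1e6 = 250,000
    #     TPM(g2) = 30 / 40 * 1e6 = 750,000
    # Both returned dicts are log2-transformed: fpkm_dict holds log2(FPKM + 1)
    # and tpm_dict holds log2(TPM + 1).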

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_cufflinks_params(self, params):
        """
        _validate_run_cufflinks_params:
                Raises an exception if params are invalid
        """

        log('Start validating run_cufflinks params')

        # check for required parameters
        for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)

            raise ValueError(error_msg)

    def _run_gffread(self, gff_path, gtf_path):
        """
        _run_gffread: run gffread script

        ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility
        """
        log('converting gff to gtf')
        command = self.GFFREAD_TOOLKIT_PATH + '/gffread '
        command += "-E {0} -T -o {1}".format(gff_path, gtf_path)

        self._run_command(command)

    def _create_gtf_annotation_from_genome(self, genome_ref):
        """
         Create reference annotation file from genome
        """
        ref = self.ws.get_object_subset([{
            'ref': genome_ref,
            'included': ['contigset_ref', 'assembly_ref']
        }])
        contig_id = None
        if 'contigset_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['contigset_ref']
        elif 'assembly_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['assembly_ref']
        if contig_id is None:
            raise ValueError(
                "Genome at {0} does not have reference to the assembly object".
                format(genome_ref))
        print(contig_id)
        log("Generating GFF file from Genome")
        try:
            ret = self.au.get_assembly_as_fasta({'ref': contig_id})
            output_file = ret['path']
            mapping_filename = c_mapping.create_sanitized_contig_ids(
                output_file)
            os.remove(output_file)
            # get the GFF
            ret = self.gfu.genome_to_gff({'genome_ref': genome_ref})
            genome_gff_file = ret['file_path']
            c_mapping.replace_gff_contig_ids(genome_gff_file,
                                             mapping_filename,
                                             to_modified=True)
            gtf_ext = ".gtf"

            if not genome_gff_file.endswith(gtf_ext):
                gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf'
                self._run_gffread(genome_gff_file, gtf_path)
            else:
                gtf_path = genome_gff_file

            log("gtf file : " + gtf_path)
        except Exception:
            raise ValueError(
                "Generating a GTF file from the Genome Annotation object failed: {}".format(
                    "".join(traceback.format_exc())))
        return gtf_path
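    # In outline, _create_gtf_annotation_from_genome works as follows:
    #   1. download the assembly FASTA referenced by the genome,
    #   2. build a sanitized-contig-id mapping from it (c_mapping),
    #   3. export the genome as GFF and rewrite its contig ids with that mapping,
    #   4. convert the GFF to GTF with gffread unless the file is already a .gtf.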

    def _get_gtf_file(self, alignment_ref):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch
        alignment_data = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]['data']

        genome_ref = alignment_data.get('genome_id')
        # genome_name = self.ws.get_object_info([{"ref": genome_ref}], includeMetadata=None)[0][1]
        # ws_gtf = genome_name+"_GTF_Annotation"

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_gtf_file_from_genome_ref(self, genome_ref):
        """
        _get_gtf_file_from_genome_ref: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_input_file(self, alignment_ref):
        """
        _get_input_file: get input BAM file from Alignment object
        """

        bam_file_dir = self.rau.download_alignment(
            {'source_ref': alignment_ref})['destination_dir']

        files = os.listdir(bam_file_dir)
        bam_file_list = [
            file for file in files if re.match(r'.*\_sorted\.bam', file)
        ]
        if not bam_file_list:
            bam_file_list = [
                file for file in files if re.match(r'.*(?<!sorted)\.bam', file)
            ]

        if not bam_file_list:
            raise ValueError('Cannot find .bam file from alignment {}'.format(
                alignment_ref))

        bam_file_name = bam_file_list[0]

        bam_file = os.path.join(bam_file_dir, bam_file_name)

        return bam_file
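    # Example of the BAM selection above (illustrative comment; file names are
    # hypothetical): given ['reads.bam', 'reads_sorted.bam', 'reads.bai'], the
    # first pattern picks 'reads_sorted.bam'. If no '*_sorted.bam' file exists,
    # the fallback pattern accepts any '.bam' file whose name does not end in
    # 'sorted.bam'.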

    def _generate_command(self, params):
        """
        _generate_command: generate cufflinks command
        """
        cufflinks_command = '/opt/cufflinks/cufflinks'
        cufflinks_command += (' -q --no-update-check -p ' +
                              str(params.get('num_threads', 1)))
        if params.get('max_intron_length') is not None:
            cufflinks_command += (' --max-intron-length ' +
                                  str(params['max_intron_length']))
        if params.get('min_intron_length') is not None:
            cufflinks_command += (' --min-intron-length ' +
                                  str(params['min_intron_length']))
        if params.get('overhang_tolerance') is not None:
            cufflinks_command += (' --overhang-tolerance ' +
                                  str(params['overhang_tolerance']))

        cufflinks_command += " -o {0} -G {1} {2}".format(
            params['result_directory'], params['gtf_file'],
            params['input_file'])

        log('Generated cufflinks command: {}'.format(cufflinks_command))

        return cufflinks_command
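    # Example of an assembled command (illustrative comment; paths are
    # hypothetical, line wrapped here for readability only):
    #   /opt/cufflinks/cufflinks -q --no-update-check -p 4 \
    #       --max-intron-length 300000 \
    #       -o /kb/module/work/tmp/<uuid> -G /kb/module/work/tmp/genome.gtf \
    #       /kb/module/work/tmp/reads_sorted.bam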

    def _process_rnaseq_alignment_object(self, params):
        """
        _process_rnaseq_alignment_object: process a KBaseRNASeq.RNASeqAlignment type input object
        """
        log('start processing RNASeqAlignment object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        if '/' not in params['genome_ref']:
            params['genome_ref'] = params['workspace_name'] + '/' + params[
                'genome_ref']

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_rnaseq_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params['gtf_file'],
            params['expression_suffix'])

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{
            "ref": expression_obj_ref
        }],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _process_kbasesets_alignment_object(self, params):
        """
        _process_kbasesets_alignment_object: process a single alignment from a KBaseSets.ReadsAlignmentSet
        """
        log('start processing KBaseSets object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_kbasesets_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params.get('gtf_file'),
            params.get('expression_suffix'))

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{
            "ref": expression_obj_ref
        }],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_html_report(self, result_directory, obj_ref):
        """
        _generate_html_report: generate html summary report
        """
        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]

        expression_object_type = expression_object.get('info')[2]

        Overview_Content = ''
        if re.match('KBaseRNASeq.RNASeqExpression-\d.\d',
                    expression_object_type):
            Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
        elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d',
                      expression_object_type):
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data'][
                    'sample_expression_ids']:
                expression_name = self.ws.get_object_info(
                    [{
                        "ref": expression_ref
                    }], includeMetadata=None)[0][1]
                Overview_Content += '<p>{}</p>'.format(expression_name)
        elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type):
            pprint(expression_object)
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data']['items']:
                expression_name = self.ws.get_object_info(
                    [{
                        "ref": expression_ref['ref']
                    }], includeMetadata=None)[0][1]
                condition = expression_ref['label']
                Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format(
                    condition, expression_name)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path': result_file_path,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Cufflinks App'
        })
        return html_report

    def _save_rnaseq_expression(self, result_directory, alignment_ref,
                                workspace_name, genome_ref, gtf_file,
                                expression_suffix):
        """
        _save_rnaseq_expression: save Expression object to workspace
        """
        log('start saving Expression object')
        alignment_object_name = self.ws.get_object_info(
            [{
                "ref": alignment_ref
            }], includeMetadata=None)[0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref':
            workspace_name + '/' + expression_name,
            'source_dir':
            result_directory,
            'alignment_ref':
            alignment_ref,
            'tool_used':
            self.tool_used,
            'tool_version':
            self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_kbasesets_expression(self, result_directory, alignment_ref,
                                   workspace_name, genome_ref, gtf_file,
                                   expression_suffix):
        """
        _save_kbasesets_expression: save Expression object to workspace using ExpressionUtils
        and SetAPI
        """
        log('start saving Expression object')

        alignment_info = self.ws.get_object_info3(
            {'objects': [{
                "ref": alignment_ref
            }]})
        alignment_object_name = alignment_info['infos'][0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref':
            workspace_name + '/' + expression_name,
            'source_dir':
            result_directory,
            'alignment_ref':
            alignment_ref,
            'tool_used':
            self.tool_used,
            'tool_version':
            self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_rnaseq_expression_set(self, alignment_expression_map,
                                    alignment_set_ref, workspace_name,
                                    expression_set_name):
        """
        _save_rnaseq_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _save_kbasesets_expression_set(self, alignment_expression_map,
                                       alignment_set_ref, workspace_name,
                                       expression_set_name):
        """
        _save_kbasesets_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _generate_report(self,
                         obj_ref,
                         workspace_name,
                         result_directory,
                         exprMatrix_FPKM_ref=None,
                         exprMatrix_TPM_ref=None):
        """
        _generate_report: generate summary report
        """

        log('creating report')

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_html_report(result_directory,
                                                       obj_ref)

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]
        expression_info = expression_object['info']
        expression_data = expression_object['data']

        expression_object_type = expression_info[2]
        if re.match('KBaseRNASeq.RNASeqExpression-\d+.\d+',
                    expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'Expression generated by Cufflinks'
            }]
        elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'Expression generated by Cufflinks'
            }]
        elif re.match('KBaseSets.ExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'ExpressionSet generated by Cufflinks'
            }]
            items = expression_data['items']
            for item in items:
                objects_created.append({
                    'ref':
                    item['ref'],
                    'description':
                    'Expression generated by Cufflinks'
                })
            objects_created.append({
                'ref':
                exprMatrix_FPKM_ref,
                'description':
                'FPKM ExpressionMatrix generated by Cufflinks'
            })
            objects_created.append({
                'ref':
                exprMatrix_TPM_ref,
                'description':
                'TPM ExpressionMatrix generated by Cufflinks'
            })

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _parse_FPKMtracking(self, filename, metric):
        result = {}
        pos1 = 0
        if metric == 'FPKM':
            pos2 = 7
        elif metric == 'TPM':
            pos2 = 8
        else:
            raise ValueError('Unsupported metric: {}'.format(metric))

        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                if larr[pos1] != "":
                    try:
                        result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2)
                    except ValueError:
                        result[larr[pos1]] = math.log(1, 2)

        return result
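    # Note on the parsing above: the gene identifier is read from column 0 and
    # the metric from column 7 (FPKM) or column 8 (TPM) of the tracking file;
    # each value is stored as log2(value + 1), falling back to log2(1) = 0 when
    # the field cannot be parsed as a float.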

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cufflinks_result.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.DS_Store')):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by Cufflinks App'
        })

        return output_files

    def _generate_expression_data(self, result_directory, alignment_ref,
                                  gtf_file, workspace_name, expression_suffix):
        """
        _generate_expression_data: generate Expression object with cufflinks output files
        """
        alignment_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]

        # set expression name
        alignment_object_name = alignment_data_object['info'][1]
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_data = {
            'id': expression_name,
            'type': 'RNA-Seq',
            'numerical_interpretation': 'FPKM',
            'processing_comments': 'log2 Normalized',
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        }
        alignment_data = alignment_data_object['data']

        condition = alignment_data.get('condition')
        expression_data.update({'condition': condition})

        genome_id = alignment_data.get('genome_id')
        expression_data.update({'genome_id': genome_id})

        read_sample_id = alignment_data.get('read_sample_id')
        expression_data.update(
            {'mapped_rnaseq_alignment': {
                read_sample_id: alignment_ref
            }})

        exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM(
            os.path.join(result_directory, 'genes.fpkm_tracking'))

        expression_data.update({'expression_levels': exp_dict})

        expression_data.update({'tpm_expression_levels': tpm_exp_dict})

        handle = self.dfu.file_to_shock({
            'file_path': result_directory,
            'pack': 'zip',
            'make_handle': True
        })['handle']
        expression_data.update({'file': handle})

        return expression_data

    def _generate_expression_set_data(self, alignment_expression_map,
                                      alignment_set_ref, expression_set_name):
        """
        _generate_expression_set_data: generate ExpressionSet object with cufflinks output files
        """
        alignment_set_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_set_ref
            }]})['data'][0]

        alignment_set_data = alignment_set_data_object['data']

        expression_set_data = {
            'tool_used': self.tool_used,
            'tool_version': self.tool_version,
            'id': expression_set_name,
            'alignmentSet_id': alignment_set_ref,
            'genome_id': alignment_set_data.get('genome_id'),
            'sampleset_id': alignment_set_data.get('sampleset_id')
        }

        sample_expression_ids = []
        mapped_expression_objects = []
        mapped_expression_ids = []

        for alignment_expression in alignment_expression_map:
            alignment_ref = alignment_expression.get('alignment_ref')
            expression_ref = alignment_expression.get('expression_obj_ref')
            sample_expression_ids.append(expression_ref)
            mapped_expression_ids.append({alignment_ref: expression_ref})
            alignment_name = self.ws.get_object_info(
                [{
                    "ref": alignment_ref
                }], includeMetadata=None)[0][1]
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_ref
                }], includeMetadata=None)[0][1]
            mapped_expression_objects.append({alignment_name: expression_name})

        expression_set_data['sample_expression_ids'] = sample_expression_ids
        expression_set_data[
            'mapped_expression_objects'] = mapped_expression_objects
        expression_set_data['mapped_expression_ids'] = mapped_expression_ids

        return expression_set_data

    def _process_alignment_set_object(self, params, alignment_object_type):
        """
        _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object
                                        and KBaseSets.ReadsAlignmentSet type object
        """
        log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object'
            '\nparams:\n{}'.format(json.dumps(params, indent=1)))

        alignment_set_ref = params.get('alignment_set_ref')

        if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            params['gtf_file'] = self._get_gtf_file(alignment_set_ref)
        else:
            if '/' not in params['genome_ref']:
                params['genome_ref'] = params['workspace_name'] + '/' + params[
                    'genome_ref']

            params['gtf_file'] = self._get_gtf_file_from_genome_ref(
                params['genome_ref'])

        alignment_set = self.set_api.get_reads_alignment_set_v1({
            'ref': alignment_set_ref,
            'include_item_info': 0,
            'include_set_item_ref_paths': 1
        })
        mul_processor_params = []
        for alignment in alignment_set["data"]["items"]:
            alignment_ref = alignment['ref_path']
            alignment_upload_params = params.copy()
            alignment_upload_params['alignment_ref'] = alignment_ref
            mul_processor_params.append(alignment_upload_params)
            # use the following when you want to run the cmd sequentially
            # self._process_kbasesets_alignment_object(mul_processor_params[0])

        cpus = min(params.get('num_threads') or multiprocessing.cpu_count(),
                   multiprocessing.cpu_count())
        pool = Pool(ncpus=cpus)
        log('running _process_alignment_object with {} cpus'.format(cpus))
        alignment_expression_map = pool.map(
            self._process_kbasesets_alignment_object, mul_processor_params)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        expression_items = list()
        for proc_alignment_return in alignment_expression_map:
            expression_obj_ref = proc_alignment_return.get(
                'expression_obj_ref')
            alignment_ref = proc_alignment_return.get('alignment_ref')
            alignment_info = self.ws.get_object_info3({
                'objects': [{
                    "ref": alignment_ref
                }],
                'includeMetadata':
                1
            })
            condition = alignment_info['infos'][0][10]['condition']
            expression_items.append({
                "ref": expression_obj_ref,
                "label": condition,
            })
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_obj_ref
                }], includeMetadata=None)[0][1]
            self._run_command('cp -R {} {}'.format(
                proc_alignment_return.get('result_directory'),
                os.path.join(result_directory, expression_name)))

        expression_set = {
            "description": "generated by kb_cufflinks",
            "items": expression_items
        }

        expression_set_info = self.set_api.save_expression_set_v1({
            "workspace": params['workspace_name'],
            "output_object_name": params['expression_set_name'],
            "data": expression_set
        })

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_set_info['set_ref']
        }

        widget_params = {
            "output": params.get('expression_set_name'),
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_output_object_name(self, params, alignment_object_type,
                                     alignment_object_name):
        """
        Generates the output object name based on input object type and name and stores it in
        params with key equal to 'expression' or 'expression_set' based on whether the input
        object is an alignment or alignment_set.

        :param params: module input params
        :param alignment_object_type: input alignment object type
        :param alignment_object_name: input alignment object name
        :param alignment_object_data: input alignment object data
        """
        expression_set_suffix = params['expression_set_suffix']
        expression_suffix = params['expression_suffix']

        if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment$', alignment_object_name):
                params['expression_name'] = re.sub('_[Aa]lignment$',
                                                   expression_suffix,
                                                   alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_name'] = alignment_object_name + expression_suffix
        if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):
                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix
        if re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):

                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix
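    # Example name transformations (illustrative comment; the suffix values are
    # assumptions taken as typical app inputs):
    #   'sample1_alignment'     + expression_suffix '_expression'
    #       -> expression_name 'sample1_expression'
    #   'cohortA_alignment_set' + expression_set_suffix '_expression_set'
    #       -> expression_set_name 'cohortA_expression_set'
    # Names without a '_alignment' / '_alignment_set' ending simply get the
    # corresponding suffix appended.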

    def _save_expression_matrix(self, expressionset_ref, workspace_name):
        """
        _save_expression_matrix: save FPKM and TPM ExpressionMatrix
        """

        log('start saving ExpressionMatrix object')

        expression_set_name = self.ws.get_object_info(
            [{
                "ref": expressionset_ref
            }], includeMetadata=None)[0][1]

        output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '',
                                        expression_set_name)

        upload_expression_matrix_params = {
            'expressionset_ref': expressionset_ref,
            'output_obj_name': output_obj_name_prefix,
            'workspace_name': workspace_name
        }

        expression_matrix_refs = self.eu.get_expressionMatrix(
            upload_expression_matrix_params)

        return expression_matrix_refs

    def run_cufflinks_app(self, params):
        log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_cufflinks_params(params)

        alignment_object_ref = params.get('alignment_object_ref')
        alignment_object_info = self.ws.get_object_info3(
            {"objects": [{
                "ref": alignment_object_ref
            }]})['infos'][0]

        alignment_object_type = alignment_object_info[2]
        alignment_object_name = alignment_object_info[1]

        # get output object name
        self._generate_output_object_name(params, alignment_object_type,
                                          alignment_object_name)

        log('--->\nalignment object type: \n' +
            '{}'.format(alignment_object_type))

        if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            params.update({'alignment_ref': alignment_object_ref})
            returnVal = self._process_rnaseq_alignment_object(params)
            report_output = self._generate_report(
                returnVal.get('expression_obj_ref'),
                params.get('workspace_name'),
                returnVal.get('result_directory'))
            returnVal.update(report_output)
        elif re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \
             re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            params.update({'alignment_set_ref': alignment_object_ref})
            returnVal = self._process_alignment_set_object(
                params, alignment_object_type)
            expression_matrix_refs = self._save_expression_matrix(
                returnVal['expression_obj_ref'], params.get('workspace_name'))
            returnVal.update(expression_matrix_refs)

            report_output = self._generate_report(
                returnVal['expression_obj_ref'], params.get('workspace_name'),
                returnVal['result_directory'],
                expression_matrix_refs['exprMatrix_FPKM_ref'],
                expression_matrix_refs['exprMatrix_TPM_ref'])
            returnVal.update(report_output)
        else:
            raise ValueError(
                'Unsupported alignment object type.\nObject info:\n{}'.format(
                    alignment_object_info))

        return returnVal
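# --- Illustration (added; not part of the original module) ------------------
# run_cufflinks_app() above dispatches on the alignment object's type string.
# The helper below reproduces that dispatch so it can be exercised without a
# KBase workspace; the type strings in the assertions are plausible examples.
import re

def _classify_alignment_type(type_string):
    """Classify a workspace type string the way run_cufflinks_app does."""
    if re.match(r'^KBaseRNASeq.RNASeqAlignment-\d*', type_string):
        return 'single_alignment'
    if re.match(r'^KBaseRNASeq.RNASeqAlignmentSet-\d*', type_string) or \
            re.match(r'^KBaseSets.ReadsAlignmentSet-\d*', type_string):
        return 'alignment_set'
    return 'unsupported'

assert _classify_alignment_type('KBaseRNASeq.RNASeqAlignment-0.6') == 'single_alignment'
assert _classify_alignment_type('KBaseSets.ReadsAlignmentSet-1.0') == 'alignment_set'
assert _classify_alignment_type('KBaseGenomes.Genome-17.0') == 'unsupported'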
Example #15
class kb_virsorterTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        user_id = requests.post(
            'https://kbase.us/services/authorization/Sessions/Login',
            data='token={}&fields=user_id'.format(token)).json()['user_id']
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'kb_virsorter',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})

        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_virsorter'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.shockURL = cls.cfg['shock-url']  # assumed to be present in the deploy config
        cls.token = token
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = kb_virsorter(cls.cfg)

        cls.testobjref = []
        #cls.testobjdata = []
        cls.testwsname = []

    @classmethod
    def tearDownClass(cls):
        if hasattr(cls, 'wsName'):
            cls.wsClient.delete_workspace({'workspace': cls.wsName})
            print('Test workspace was deleted')

        if hasattr(cls, 'testwsname') and len(cls.testwsname) > 0:
            try:
                print('Deleting workspace 2 ' + cls.testwsname[0])
                cls.wsClient.delete_workspace({'workspace': cls.testwsname[0]})
                print('Test workspace 2 was deleted ' + cls.testwsname[0])
            except Exception as e:
                print(e)

        #if hasattr(cls, 'testobjdata'):
        #    try:
        #        print('Deleting shock data ' + str(len(cls.testobjdata)))
        #        print('Deleting shock data ' + str(len(cls.testobjdata[0]['data'][0])))
        #        print('Deleting shock data ' + str(cls.testobjdata[0]))
        #        node = cls.testobjdata[0]['data'][0]['lib']['file']['id']
        #        cls.delete_shock_node(node)
        #        print('Test shock data was deleted')
        #    except Exception as e:
        #        print e

    def getWsClient(self):
        return self.__class__.wsClient

    def getWsName(self):
        if hasattr(self.__class__, 'wsName'):
            return self.__class__.wsName
        suffix = int(time.time() * 1000)
        wsName = "test_kb_virsorter_" + str(suffix)
        ret = self.getWsClient().create_workspace({'workspace': wsName})
        self.__class__.wsName = wsName
        return wsName

    def getImpl(self):
        return self.__class__.serviceImpl

    def getContext(self):
        return self.__class__.ctx
    
    
    def write_file(self, filename, content):
        tmp_dir = self.cfg['scratch']
        file_path = os.path.join(tmp_dir, filename)
        with open(file_path, 'w') as fh1:
            fh1.write(content)
        return file_path


    def delete_shock_node(self, node_id):
        header = {'Authorization': 'Oauth {0}'.format(self.token)}
        requests.delete(self.shockURL + '/node/' + node_id, headers=header,
                        allow_redirects=True)

    def ztest_aaa_upload_to_shock(self):

        print "upload ref data to shock staging"
        self.dfUtil = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        #file_path =  self.write_file('Phage_gene_catalog.tar.gz', 'Test')

        input_file_name = 'Phage_gene_catalog_plus_viromes.tar.gz'#'Phage_gene_catalog.tar.gz'#''PFAM_27.tar.gz'
        source_file_path = "/kb/module/work/"+input_file_name# os.path.join(tmp_dir, input_file_name)

        tmp_dir = self.cfg['scratch']
        target_file_path = os.path.join(tmp_dir, input_file_name)

        print "file_path " + source_file_path+"\t"+target_file_path

        orig_size = os.path.getsize(source_file_path)

        shutil.copy(source_file_path, target_file_path)

        print "Testing "+target_file_path
        print(os.path.isfile(target_file_path))

        ret1 = self.dfUtil.file_to_shock(
            {'file_path': target_file_path})
        
        print(str(ret1))
        shock_id = ret1['shock_id']
        
        print "shock_id "+shock_id
        file_path2 = os.path.join("/kb/module/work/", 'test.tar.gz')

        #ret2 = self.dfUtil.shock_to_file(
        #    {'shock_id': shock_id, 'file_path': file_path2})[0]
        ret2 = self.dfUtil.shock_to_file(
            {'shock_id': shock_id, 'file_path': file_path2})

        print(ret2)

        file_name = ret2['node_file_name']
        attribs = ret2['attributes']
        self.assertEqual(file_name, 'Phage_gene_catalog_plus_viromes.tar.gz')
        self.assertEqual(ret2['file_path'], file_path2)
        self.assertEqual(ret2['size'], orig_size)
        self.assertIsNone(attribs)

        #self.delete_shock_node(shock_id)


    def create_random_string(self):
        N = 20
        return ''.join(
            random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(N))

    def test_virsorter_ok(self):
        self.upload_assembly()


        if not self.testwsname:
            self.testwsname.append(self.create_random_string())

        print "upload_reads self.testwsname[0] " + self.testwsname[0]

        #try:
        #    ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  # test_ws_name
        #except Exception as e:
        #    # print "ERROR"
        #    # print(type(e))
        #    # print(e.args)
        #    print(e)
        #    pass

        print "self.testwsname "+ str(self.testwsname)
        params = {}
        params['assembly_ref'] =  str(self.testobjref[0])#str(self.testwsname[0])+"/"+ #"16589/2/1"#""#'16589/2/1'#self.testobjref
        params['ws_name'] = self.testwsname[0]

        result = self.getImpl().run_virsorter(self.getContext(), params)
        print('RESULT run_virsorter:')
        pprint(result)

        #testresult = [
        #    {'blah': 'blah', 'bleh': 'bleh'}]

        testresult = [{'report_ref': result[0]['report_ref'], 'report_name': result[0]['report_name']}]


        self.assertEqual(result, testresult)


    def upload_assembly(self):
        if not self.testobjref:

            print "upload_assembly start"
    
            indata = 'U00096.2.fa'#_first1000.
            ftarget = os.path.join(self.cfg['scratch'], indata)#self.scratch, indata)
            print "ftarget " + ftarget
            ret = shutil.copy('../test_data/' + indata, ftarget)
    
            #self.readsUtilClient = ReadsUtils(os.environ['SDK_CALLBACK_URL'])

            self.assemblyUtilClient = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])

            if not self.testwsname:
                self.testwsname.append(self.create_random_string())
    
            print "upload_assembly self.testwsname[0] " + self.testwsname[0]
    
            try:
                ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  #test_ws_name
            except Exception as e:
                #print "ERROR"
                #print(type(e))
                #print(e.args)
                print(e)
                pass
    
            try:
                print "attempt upload"
                print "ftarget " + ftarget
                ref = self.assemblyUtilClient.save_assembly_from_fasta(
                    {
                     'workspace_name': self.testwsname[0],
                     'assembly_name': 'Ecolik12MG1655',
                     'file': {'path': ftarget}})
        
                print "upload_assembly"
                print ref
                #self.testobjref = []
                self.testobjref.append(self.testwsname[0] + '/Ecolik12MG1655/1')
                #self.testobjdata = []
                #self.testobjdata.append(self.dfu.get_objects(
                #    {'object_refs': [self.testobjref[0]]}))
        
                ##print self.testobjdata[0]
    
            except Exception as e:
                print(e)
                pass
    
            print "self.testobjref[0]"
            print self.testobjref
            print self.testobjref[0]
Example #16
class BallgownUtil:

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.fv = KBaseFeatureValues(self.callback_url)
        self.deu = DifferentialExpressionUtils(self.callback_url, service_ver='dev')
        self.ws = Workspace(self.ws_url, token=self.token)
        self.scratch = config['scratch']
        self.config = config

    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _validate_run_ballgown_app_params(self, params):
        """
        _validate_run_ballgown_app_params:
                validates params passed to run_ballgown_app method
        """

        log('start validating run_ballgown_app params')

        # check for required parameters
        for p in ['expressionset_ref', 'diff_expression_matrix_set_suffix',
                  'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        run_all_combinations = params.get('run_all_combinations')
        condition_pair_subset = params.get('condition_pair_subset')

        if not self._xor(run_all_combinations, condition_pair_subset):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide subset of condition pairs. Don't provide both, or neither."
            raise ValueError(error_msg)
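    # Examples of the exclusive-choice check above (illustrative comment; the
    # condition labels are hypothetical):
    #   {'run_all_combinations': 1}                       -> accepted
    #   {'condition_pair_subset': [('wt', 'mutant')]}     -> accepted
    #   both 'run_all_combinations' and 'condition_pair_subset' set -> rejected
    #   neither of the two set                            -> rejected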

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _generate_html_report(self, result_directory, params, diff_expression_matrix_set_ref):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        for file in glob.glob(os.path.join(result_directory, '*.tsv')):
            shutil.copy(file, output_directory)

        # volcano_plot exists only if there are two condition groups
        for file in glob.glob(os.path.join(result_directory, '*.png')):
            shutil.copy(file, output_directory)

        diff_expr_set = self.ws.get_objects2({'objects':
                                              [{'ref':
                                                diff_expression_matrix_set_ref[
                                                    'diffExprMatrixSet_ref']}]})['data'][0]
        diff_expr_set_data = diff_expr_set['data']
        diff_expr_set_info = diff_expr_set['info']
        diff_expr_set_name = diff_expr_set_info[1]

        overview_content = ''
        overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrixSet'
        overview_content += ' Object</th></tr>'
        overview_content += '<tr><td>{} ({})'.format(diff_expr_set_name,
                                                     diff_expression_matrix_set_ref[
                                                         'diffExprMatrixSet_ref'])
        overview_content += '</td></tr></table>'

        overview_content += '<p><br/></p>'

        overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrix'
        overview_content += ' Object</th><th></th><th></th><th></th></tr>'
        overview_content += '<tr><th>Differential Expression Matrix Name</th>'
        overview_content += '<th>Condition 1</th>'
        overview_content += '<th>Condition 2</th>'
        overview_content += '</tr>'

        for item in diff_expr_set_data['items']:
            item_diffexprmatrix_object = self.ws.get_objects2({'objects':
                                                               [{'ref': item['ref']}]})[
                'data'][0]
            item_diffexprmatrix_info = item_diffexprmatrix_object['info']
            item_diffexprmatrix_data = item_diffexprmatrix_object['data']
            diffexprmatrix_name = item_diffexprmatrix_info[1]

            overview_content += '<tr><td>{} ({})</td>'.format(diffexprmatrix_name,
                                                              item['ref'])
            overview_content += '<td>{}</td>'.format(
                list(item_diffexprmatrix_data.get('condition_mapping').keys())[0])
            overview_content += '<td>{}</td>'.format(
                list(item_diffexprmatrix_data.get('condition_mapping').values())[0])
            overview_content += '</tr>'
        overview_content += '</table>'

        # visualization
        image_content = ''
        for image in glob.glob(output_directory + "/*.png"):
            image = image.replace(output_directory + '/', '')
            caption = image.replace(output_directory + '/', '').replace('.png', '')
            image_content += '<p style="text-align:center"><img align="center" src="{}" ' \
                             'width="600" height="400"></a><a target="_blank"><br>' \
                             '<p align="center">{}</p></p>'.format(
                                 image, caption)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          overview_content)
                report_template = report_template.replace('<p>Image Gallery</p>',
                                                          image_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Ballgown App'})
        return html_report

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'ballgown_result.zip')

        with zipfile.ZipFile(result_file, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.zip') or
                            file.endswith('.png') or
                            file.endswith('.DS_Store')):
                        zip_file.write(os.path.join(root, file), file)

        output_files.append({'path': result_file,
                             'name': os.path.basename(result_file),
                             'label': os.path.basename(result_file),
                             'description': 'File(s) generated by Ballgown App'})

        return output_files

    def _generate_report(self, params, result_directory, diff_expression_matrix_set_ref):
        """
        _generate_report: generate summary report
        """
        log('creating report')

        output_files = self._generate_output_file_list(result_directory)

        output_html_files = self._generate_html_report(
            result_directory, params, diff_expression_matrix_set_ref)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name': 'kb_ballgown_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def get_sample_dir_group_file(self, mapped_expression_ids, condition_labels):

        ngroups = 0
        group_name_indices = {}
        group_counts = {}

        for group in condition_labels:
            if group not in group_name_indices:
                group_name_indices[group] = ngroups
                ngroups = ngroups + 1
            if group not in group_counts:
                group_counts[group] = 1
            else:
                group_counts[group] = group_counts[group] + 1

        # checks for proper ballgown execution:
        if ngroups < 2:
            raise Exception("At least two condition groups are needed for this analysis. ")
        for group in condition_labels:
            if group_counts[group] < 2:
                raise Exception(
                    "Condition group {0} has less than 2 members; ballgown will not run. "
                    "At least two condition groups are needed for this analysis. ".format(group))

        group_file_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(group_file_dir)

        try:
            condition_labels_uniqued = list(set(condition_labels))
            sgf_name = os.path.join(group_file_dir, 'sample_dir_group_file_' +
                                    condition_labels_uniqued[0] + '_' +
                                    condition_labels_uniqued[1])
            sgf = open(sgf_name, "w")
        except Exception:
            raise Exception(
                "Can't open file {0} for writing {1}".format(
                    sgf_name, traceback.format_exc()))

        index = 0  # condition label index
        for ii in mapped_expression_ids:
            for alignment_id, expression_id in ii.items():
                expression_object = self.ws.get_objects2(
                    {'objects':
                     [{'ref': expression_id}]})['data'][0]
                handle_id = expression_object['data']['file']['hid']
                expression_name = expression_object['info'][1]

                expression_dir = os.path.join(group_file_dir, expression_name)
                self._mkdir_p(expression_dir)

                print('expression_name: ' + str(expression_dir) + ' ' +
                      str(group_name_indices[condition_labels[index]]))
                sgf.write("{0}  {1}\n".format(expression_dir,
                                              group_name_indices[condition_labels[index]]))

                self.dfu.shock_to_file({'handle_id': handle_id,
                                        'file_path': expression_dir,
                                        'unpack': 'unpack'})

                required_files = [
                    'e2t.ctab',
                    'e_data.ctab',
                    'i2t.ctab',
                    'i_data.ctab',
                    't_data.ctab']
                for file in glob.glob(expression_dir + '/*'):
                    if os.path.basename(file) not in required_files:
                        os.remove(file)

            index += 1

        sgf.close()

        return sgf_name

    def _cleanup(self, directory=None):
        """
        Clean up after the job.  At the moment this just means removing the working
        directory, but later could mean other things.
        """

        try:
            # ignore_errors=True suppresses removal errors, so a non-empty or
            # partially removed directory will not raise here
            shutil.rmtree(directory, ignore_errors=True)
        except IOError:
            log("Unable to remove working directory {0}".format(directory))
            raise

    def _setupWorkingDir(self, directory=None):
        """
        Clean up an existing workingdir and create a new one
        """
        try:
            if os.path.exists(directory):
                self._cleanup(directory)
            os.mkdir(directory)
        except (IOError, OSError):
            log("Unable to setup working dir {0}".format(directory))
            raise

    def _check_intron_measurements(self, sample_dir_group_table_file):
        """
        Check if intron measurements files are non-empty
        :param sample_dir_group_table_file:
        :return:
        """
        log('checking for intron level measurements... ')
        with open(sample_dir_group_table_file, 'r') as table_file:
            textFileLines = table_file.readlines()
        for line in textFileLines:
            expr_dir = line.split()[0]
            log(expr_dir)
            with open(os.path.join(expr_dir, 'i2t.ctab'), 'r') as i2t_file:
                if len(i2t_file.readlines()) <= 1:  # only header line exists
                    raise Exception("No intron measurements found! Input expressions are possibly "
                                    "from a prokaryote. Ballgown functions only on eukaryotic data."
                                    " Consider using DESeq2 or CuffDiff instead of Ballgown.")
            with open(os.path.join(expr_dir, 'i_data.ctab'), 'r') as idata_file:
                if len(idata_file.readlines()) <= 1:  # only header line exists
                    raise Exception("No intron measurements found! Input expressions are possibly "
                                    "from a prokaryote. Ballgown functions only on eukaryotic data."
                                    " Consider using DESeq2 or CuffDiff instead of Ballgown.")

    def run_ballgown_diff_exp(self,
                              rscripts_dir,
                              sample_dir_group_table_file,
                              ballgown_output_dir,
                              output_csv,
                              volcano_plot_file
                              ):
        """ Make R call to execute the system

        :param rscripts_dir:
        :param sample_dir_group_table_file:

        :param ballgown_output_dir:
          sample_group_table is a listing of output Stringtie subdirectories,
         (full path specification) paired with group label (0 or 1), ie
            /path/WT_rep1_stringtie    0
            /path/WT_rep2_stringtie    0
            /path/EXP_rep1_stringtie   1
            /path/EXP_rep2_stringtie   1
          (order doesn't matter, but the directory-group correspondance does)

        :param output_csv:
        :param volcano_plot_file:
        :return:
        """
        # check if intron-level expression measurements are present
        self._check_intron_measurements(sample_dir_group_table_file)

        rcmd_list = ['Rscript', os.path.join(rscripts_dir, 'ballgown_fpkmgenematrix.R'),
                     '--sample_dir_group_table', sample_dir_group_table_file,
                     '--output_dir', ballgown_output_dir,
                     '--output_csvfile', output_csv,
                     '--volcano_plot_file', volcano_plot_file
                     ]
        rcmd_str = " ".join(str(x) for x in rcmd_list)
        log("rcmd_string is {0}".format(rcmd_str))
        openedprocess = subprocess.Popen(rcmd_str, shell=True)
        openedprocess.wait()
        # Make sure the openedprocess.returncode is zero (0)
        if openedprocess.returncode != 0:
            log("R script did not return normally, return code - "
                + str(openedprocess.returncode))
            raise Exception("Rscript failure")

    def load_diff_expr_matrix(self, ballgown_output_dir, output_csv):
        """
        Reads the tab-delimited differential expression matrix file produced by
        Ballgown and returns it as a dictionary keyed by gene id.  Each value is a
        list of three strings corresponding to fold_change, pval and qval, and may
        include 'NA'.
        :param ballgown_output_dir:
        :param output_csv:
        :return:
        """

        diff_matrix_file = os.path.join(ballgown_output_dir, output_csv)

        if not os.path.isfile(diff_matrix_file):
            raise Exception("differential expression matrix csvfile {0} doesn't exist!".format(
                diff_matrix_file))

        n = 0
        dm = {}
        with open(diff_matrix_file, "r") as csv_file:
            csv_rows = csv.reader(csv_file, delimiter="\t", quotechar='"')
            for row in csv_rows:
                n = n + 1
                if (n == 1):
                    if (row != ['id', 'fc', 'pval', 'qval']):
                        raise Exception(
                            "did not get expected column heading from {0}".format(
                                diff_matrix_file))
                else:
                    if (len(row) != 4):
                        raise Exception(
                            "did not get 4 elements in row {0} of csv file {1} ".format(
                                n, diff_matrix_file))
                    key = row[0]
                    # put in checks for NA or numeric for row[1] through 4
                    if (key in dm):
                        raise Exception(
                            "duplicate key {0} in row {1} of csv file {2} ".format(
                                key, n, diff_matrix_file))
                    dm[key] = row[1:5]

        return dm
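
    # Editor's illustrative sketch (hypothetical values): for a matrix file whose
    # header row is  id  fc  pval  qval  (tab-separated, quoted) and whose first
    # data row is  "AT1G01010"  1.8  0.004  0.031, load_diff_expr_matrix returns
    #   dm["AT1G01010"] == ["1.8", "0.004", "0.031"]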

    def _transform_expression_set_data(self, expression_set_data):
        """
        Bridges KBaseSets.ExpressionSet-2.0 type data to the older
        KBaseRNASeq.RNASeqExpressionSet-3.0 structure that the implementation
        depends on, by following the nested alignment object references and
        extracting the required data.
        :param expression_set_data:
        :return: transformed expression_set_data
        """
        transform = dict()
        # get genome id
        expression_ref = expression_set_data['items'][0]['ref']
        wsid, objid, ver = expression_ref.split('/')
        expression_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}])
        transform['genome_id'] = expression_obj[0]['data']['genome_id']

        # get sampleset_id
        #alignment_ref = expression_obj[0]['data']['mapped_rnaseq_alignment'].values()[0]
        #wsid, objid, ver = alignment_ref.split('/')
        #alignment_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}])
        #transform['sampleset_id'] = alignment_obj[0]['data']['sampleset_id']

        # build mapped_expression_ids
        mapped_expression_ids = list()
        for item in expression_set_data['items']:
            expression_ref = item['ref']
            wsid, objid, ver = expression_ref.split('/')
            expression_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}])
            alignment_ref = expression_obj[0]['data']['mapped_rnaseq_alignment'].values()[0]
            mapped_expression_ids.append({alignment_ref: expression_ref})
        transform['mapped_expression_ids'] = mapped_expression_ids

        return transform
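
    # Editor's illustrative sketch (hypothetical object refs): the transformed
    # structure returned above has the shape
    #   {'genome_id': '123/4/1',
    #    'mapped_expression_ids': [{'123/7/1': '123/8/1'},    # alignment_ref: expression_ref
    #                              {'123/9/1': '123/10/1'}]}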

    def _build_condition_label_list(self, mapped_expression_ids):
        """
        Extracts the condition label from each expression referenced by the given
        mapped expression ids and builds a list of condition labels
        :param mapped_expression_ids: list of alignment-to-expression id mappings
        :return: list of condition labels whose order matches the expression order in
        the expression set data
        """
        condition_labels = list()

        for ii in mapped_expression_ids:
            for alignment_id, expression_id in ii.items():
                expression_object = self.ws.get_objects2(
                    {'objects':
                     [{'ref': expression_id}]})['data'][0]
                condition_labels.append(expression_object['data']['condition'])

        return condition_labels
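
    # Editor's illustrative sketch (hypothetical conditions): for a pairwise set of
    # two WT and two hy5 expressions, the list returned above would be
    #   ['WT', 'WT', 'hy5', 'hy5']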

    def _update_output_file_header(self, output_file):
        """
        Modify header of output file (required by DifferentialExpressionUtils)
        :param output_file:
        :return:
        """
        f = open(output_file, 'r')
        filedata = f.read()
        f.close()

        modified_output = filedata.replace(
            '"id"\t"fc"\t"pval"\t"qval"',
            'gene_id\tlog2_fold_change\tp_value\tq_value')

        f = open(output_file, 'w')
        f.write(modified_output)
        f.close()

    def _check_input_labels(self, condition_pair_subset, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        # example struct: [{u'condition': u'hy5'}, {u'condition': u'WT'}]
        condition_values = set()
        for condition in condition_pair_subset:
            condition_values.add(condition['condition'])

        if len(condition_values) < 2:
            error_msg = 'At least two unique conditions must be specified. '
            raise ValueError(error_msg)

        for condition in condition_pair_subset:

            label = condition['condition'].strip()
            if label not in available_condition_labels:
                error_msg = 'Condition label "{}" is not a valid condition. '.format(label)
                error_msg += 'Must be one of "{}"'.format(available_condition_labels)
                raise ValueError(error_msg)

        return checked

    def run_ballgown_app(self, params):
        """
        run_ballgown_app: run Ballgown app
        (https://www.bioconductor.org/packages/release/bioc/html/ballgown.html)

        required params:
            expressionset_ref: ExpressionSet object reference
            diff_expression_matrix_set_suffix: suffix appended to the output
            KBaseSets.DifferentialExpressionMatrixSet object name
            condition_labels: conditions for expression set object
            alpha_cutoff: q value cutoff
            fold_change_cutoff: fold change cutoff
            workspace_name: the name of the workspace it gets saved to

        optional params:
            fold_scale_type: one of ["linear", "log2+1", "log10+1"]

        return:
            result_directory: folder path that holds all files generated by run_ballgown_app
            diff_expression_matrix_set_ref: generated KBaseSets.DifferentialExpressionMatrixSet
            object reference
            report_name: report name generated by KBaseReport
            report_ref: report reference generated by KBaseReport
        """
        log('--->\nrunning BallgownUtil.run_ballgown_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_ballgown_app_params(params)

        expressionset_ref = params.get('expressionset_ref')

        expression_set_info = self.ws.get_object_info3({
            "objects": [{"ref": expressionset_ref}]})['infos'][0]
        expression_object_type = expression_set_info[2]

        # set output object name
        differential_expression_suffix = params['diff_expression_matrix_set_suffix']
        expression_name = expression_set_info[1]
        if re.match('.*_[Ee]xpression$', expression_name):
            params['diff_expression_matrix_set_name'] = re.sub(
                '_[Ee]xpression$', differential_expression_suffix, expression_name)
        elif re.match('.*_[Ee]xpression_[Ss]et$', expression_name):
            params['diff_expression_matrix_set_name'] = re.sub(
                '_[Ee]xpression_[Ss]et$', differential_expression_suffix, expression_name)
        else:
            params['diff_expression_matrix_set_name'] = expression_name + \
                differential_expression_suffix

        log('--->\nexpression object type: \n' +
            '{}'.format(expression_object_type))

        if re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d', expression_object_type):
            expression_set_data = self.ws.get_objects2(
                {'objects':
                 [{'ref': expressionset_ref}]})['data'][0]['data']

        elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type):
            expression_set_data = self.ws.get_objects2(
                {'objects':
                 [{'ref': expressionset_ref}]})['data'][0]['data']

            expression_set_data = self._transform_expression_set_data(expression_set_data)
        else:
            raise ValueError('Unsupported expression set object type: ' +
                             str(expression_object_type))

        mgroup = MultiGroup(self.ws)
        pairwise_mapped_expression_ids = mgroup.build_pairwise_groups(
            expression_set_data['mapped_expression_ids'])

        ballgown_output_dir = os.path.join(self.scratch, "ballgown_out")
        log("ballgown output dir is {0}".format(ballgown_output_dir))
        self._setupWorkingDir(ballgown_output_dir)

        # get set of all condition labels
        available_condition_labels = \
            self._build_condition_label_list(expression_set_data['mapped_expression_ids'])

        if params.get('run_all_combinations'):
            requested_condition_labels = available_condition_labels
        else:
            # get set of user specified condition labels
            condition_pair_subset = params.get('condition_pair_subset')
            if self._check_input_labels(condition_pair_subset, available_condition_labels):
                requested_condition_labels = list()
                # example: [{u'condition': u'hy5'}, {u'condition': u'WT'}]
                for condition in condition_pair_subset:
                    if condition.get('condition').strip() not in requested_condition_labels:
                        requested_condition_labels.append(condition.get('condition').strip())

        log("User requested pairwise combinations from condition label list : " +
            str(requested_condition_labels))

        diff_expr_files = list()

        for mapped_expression_ids in pairwise_mapped_expression_ids:
            print('processing pairwise combination: ')
            pprint(mapped_expression_ids)
            print('with condition labels: ')
            condition_labels = self._build_condition_label_list(mapped_expression_ids)
            pprint(condition_labels)

            # skip if condition labels in this pairwise combination don't exist in
            # set of user requested condition labels
            skip = False
            for condition in condition_labels:
                if condition not in requested_condition_labels:
                    log("skipping " + str(condition_labels))
                    skip = True
            if skip:
                continue

            sample_dir_group_file = self.get_sample_dir_group_file(mapped_expression_ids,
                                                                   condition_labels)

            log("about to run_ballgown_diff_exp")
            rscripts_dir = '/kb/module/rscripts'

            condition_labels_uniqued = list()
            for condition in condition_labels:
                if condition not in condition_labels_uniqued:
                    condition_labels_uniqued.append(condition)

            output_csv = 'ballgown_diffexp_' + \
                condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1] + '.tsv'
            volcano_plot_file = 'volcano_plot_' + \
                condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1] + '.png'

            self.run_ballgown_diff_exp(rscripts_dir,
                                       sample_dir_group_file,
                                       ballgown_output_dir,
                                       output_csv,
                                       volcano_plot_file)

            log("back from run_ballgown_diff_exp, about to load diff exp matrix file")
            # diff_expr_matrix = self.load_diff_expr_matrix(ballgown_output_dir,
            # output_csv)  # read file before its zipped

            self._update_output_file_header(os.path.join(ballgown_output_dir, output_csv))

            diff_expr_file = dict()
            diff_expr_file.update({'condition_mapping':
                                   {condition_labels_uniqued[0]: condition_labels_uniqued[1]}})
            diff_expr_file.update(
                {'diffexpr_filepath': os.path.join(ballgown_output_dir, output_csv)})
            diff_expr_files.append(diff_expr_file)

        deu_param = {
            'destination_ref': params['workspace_name'] + '/' +
            params['diff_expression_matrix_set_name'],
            'diffexpr_data': diff_expr_files,
            'tool_used': TOOL_NAME,
            'tool_version': TOOL_VERSION,
            'genome_ref': expression_set_data.get('genome_id'),
        }

        diff_expression_matrix_set_ref = self.deu.save_differential_expression_matrix_set(
            deu_param)

        returnVal = {'result_directory': ballgown_output_dir,
                     'diff_expression_matrix_set_ref':
                         diff_expression_matrix_set_ref['diffExprMatrixSet_ref']}

        report_output = self._generate_report(params,
                                              ballgown_output_dir, diff_expression_matrix_set_ref)
        returnVal.update(report_output)

        return returnVal
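
# Editor's illustrative usage sketch (hypothetical names and values; assumes a
# constructor of the form BallgownUtil(config), which is not shown here):
#
#   util = BallgownUtil(config)
#   result = util.run_ballgown_app({
#       'expressionset_ref': '123/45/1',
#       'diff_expression_matrix_set_suffix': '_DifferentialExpressionMatrixSet',
#       'alpha_cutoff': 0.05,
#       'fold_change_cutoff': 1.5,
#       'run_all_combinations': 1,
#       'workspace_name': 'my_workspace'})
#   print(result['diff_expression_matrix_set_ref'])

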
class FastaGFFToGenome:
    def __init__(self, config):
        self.cfg = config
        self.dfu = DataFileUtil(self.cfg.callbackURL)

    def import_file(self, params):

        # 1) validate parameters
        self._validate_import_file_params(params)

        # 2) construct the input directory staging area
        input_directory = os.path.join(self.cfg.sharedFolder,
                                       'fast_gff_upload_' + str(uuid.uuid4()))
        os.makedirs(input_directory)
        file_paths = self._stage_input(params, input_directory)

        # 3) extract out the parameters
        params = self._set_parsed_params(params)

        # 4) do the upload
        result = self.upload_genome(
            shock_service_url=self.cfg.shockURL,
            handle_service_url=self.cfg.handleURL,
            workspace_service_url=self.cfg.workspaceURL,
            callback_url=self.cfg.callbackURL,
            input_fasta_file=file_paths["fasta_file"],
            input_gff_file=file_paths["gff_file"],
            workspace_name=params['workspace_name'],
            core_genome_name=params['genome_name'],
            scientific_name=params['scientific_name'],
            taxon_wsname=params['taxon_wsname'],
            taxon_reference=params['taxon_reference'],
            source=params['source'],
            genome_type=params['type'],
            release=params['release'])

        # 5) generate report
        output_data_ref = params['workspace_name'] + "/" + params['genome_name']
        reportObj = {
            'objects_created': [{
                'ref': output_data_ref,
                'description': 'KBase Genome object'
            }],
            'text_message':
            result['report_string']
        }

        reportClient = KBaseReport(os.environ['SDK_CALLBACK_URL'])
        report_info = reportClient.create({
            'report':
            reportObj,
            'workspace_name':
            params['workspace_name']
        })

        # 6) clear the temp directory
        shutil.rmtree(input_directory)

        # 7) return the result
        info = result['genome_info']
        details = {
            'genome_ref':
            str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]),
            'genome_info': info,
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        return details

    def upload_genome(self,
                      shock_service_url=None,
                      handle_service_url=None,
                      workspace_service_url=None,
                      callback_url=None,
                      input_gff_file=None,
                      input_fasta_file=None,
                      workspace_name=None,
                      core_genome_name=None,
                      scientific_name="unknown_taxon",
                      taxon_wsname='ReferenceTaxons',
                      taxon_reference=None,
                      source=None,
                      release=None,
                      genome_type=None):

        # retrieve taxon
        taxonomy, taxon_reference = self._retrieve_taxon(
            taxon_reference, taxon_wsname, scientific_name)

        # reading in Fasta file
        assembly = self._retrieve_fasta_file(input_fasta_file,
                                             core_genome_name, scientific_name,
                                             source)

        if taxon_reference is not None:
            assembly["taxon_ref"] = taxon_reference

        # reading in GFF file
        feature_list = self._retrieve_gff_file(input_gff_file)

        # compile links between features
        feature_hierarchy = self._generate_feature_hierarchy(feature_list)

        # retrieve genome feature list
        (genome_features_list, genome_mrnas_list,
         genome_cdss_list) = self._retrieve_genome_feature_list(
             feature_list, feature_hierarchy, assembly)

        # remove sequences before loading
        for contig in assembly["contigs"]:
            del assembly["contigs"][contig]["sequence"]

        aUtil = AssemblyUtil(callback_url)
        assembly_ref = aUtil.save_assembly_from_fasta({
            'file': {
                'path': input_fasta_file,
                'assembly_name': assembly['assembly_id']
            },
            'workspace_name':
            workspace_name,
            'assembly_name':
            assembly['assembly_id']
        })

        # generate genome info
        genome = self._gen_genome_info(core_genome_name, scientific_name,
                                       assembly_ref, genome_features_list,
                                       genome_cdss_list, genome_mrnas_list,
                                       source, assembly, taxon_reference,
                                       taxonomy, input_gff_file)

        workspace_id = self.dfu.ws_name_to_id(workspace_name)
        genome_info = self.dfu.save_objects({
            "id":
            workspace_id,
            "objects": [{
                "name": core_genome_name,
                "type": "KBaseGenomes.Genome",
                "data": genome
            }]
        })[0]
        report_string = ''

        return {'genome_info': genome_info, 'report_string': report_string}

    def _validate_import_file_params(self, params):
        """
        validate_import_file_params:
                    validates params passed to FastaGFFToGenome.import_file method

        """

        # check for required parameters
        for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # one and only one of 'path' or 'shock_id' is required
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            if not isinstance(file, dict):
                raise ValueError(
                    'Required "{}" field must be a map/dict'.format(key))
            n_valid_fields = 0
            if 'path' in file and file['path'] is not None:
                n_valid_fields += 1
            if 'shock_id' in file and file['shock_id'] is not None:
                n_valid_fields += 1
            if 'ftp_url' in file and file['ftp_url'] is not None:
                n_valid_fields += 1
                raise ValueError(
                    'FTP link is currently not supported for FastaGFFToGenome')
            if n_valid_fields < 1:
                error_msg = 'Required "{}" field must include one source: '.format(
                    key)
                error_msg += 'path | shock_id'
                raise ValueError(error_msg)
            if n_valid_fields > 1:
                error_msg = 'Required "{}" field has too many sources specified: '.format(
                    key)
                error_msg += str(file.keys())
                raise ValueError(error_msg)

        # check for valid type param
        valid_types = ['Reference', 'User upload', 'Representative']
        if params.get('type') and params['type'] not in valid_types:
            error_msg = 'Entered value for type is not one of the valid entries of '
            error_msg += '[' + ''.join('"' + str(e) + '", '
                                       for e in valid_types)[0:-2] + ']'
            raise ValueError(error_msg)
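
    # Editor's illustrative sketch (hypothetical values): a minimal params dict
    # that passes the validation above; each file field must carry exactly one of
    # 'path' or 'shock_id':
    #   {'workspace_name': 'my_workspace',
    #    'genome_name': 'my_genome',
    #    'fasta_file': {'path': '/kb/module/work/tmp/genome.fa'},
    #    'gff_file': {'shock_id': 'aaaa-bbbb-cccc'}}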

    def _set_parsed_params(self, params):
        log('Setting params')

        # default params
        default_params = {
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'scientific_name': 'unknown_taxon',
            'taxon_reference': None,
            'source': 'User',
            'release': None,
            'type': 'User upload',
            'metadata': {}
        }

        for field in default_params:
            if field not in params:
                params[field] = default_params[field]

        log(json.dumps(params, indent=1))

        return params

    def _stage_input(self, params, input_directory):
        """
        stage_input: Setup the input_directory by fetching the files and uncompressing if needed

        """

        file_paths = dict()
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            file_path = None
            if 'path' in file and file['path'] is not None:
                local_file_path = file['path']
                file_path = os.path.join(input_directory,
                                         os.path.basename(local_file_path))
                log('Moving file from {} to {}'.format(local_file_path,
                                                       file_path))
                shutil.copy2(local_file_path, file_path)

            if 'shock_id' in file and file['shock_id'] is not None:
                # handle shock file
                log('Downloading file from SHOCK node: {}-{}'.format(
                    self.cfg.sharedFolder, file['shock_id']))
                sys.stdout.flush()
                file_name = self.dfu.shock_to_file({
                    'file_path': input_directory,
                    'shock_id': file['shock_id']
                })['node_file_name']
                file_path = os.path.join(input_directory, file_name)

            # extract the file if it is compressed
            if file_path is not None:
                print("staged input file =" + file_path)
                sys.stdout.flush()
                dfUtil_result = self.dfu.unpack_file({'file_path': file_path})
                file_paths[key] = dfUtil_result['file_path']
            else:
                raise ValueError(
                    'No valid files could be extracted based on the input')

        return file_paths

    def _retrieve_taxon(self, taxon_reference, taxon_wsname, scientific_name):
        """
        _retrieve_taxon: retrieve taxonomy and taxon_reference

        """
        taxon_id = -1
        taxon_object_name = "unknown_taxon"

        # retrieve lookup object if scientific name provided
        if (taxon_reference is None
                and scientific_name != "unknown_taxon"):
            # retrieve taxon lookup object then find taxon id
            taxon_lookup = self.dfu.get_objects({
                'object_refs': [taxon_wsname + "/taxon_lookup"],
                'ignore_errors':
                0
            })['data'][0]['data']['taxon_lookup']

            if (scientific_name[0:3] in taxon_lookup
                    and scientific_name in taxon_lookup[scientific_name[0:3]]):
                taxon_id = taxon_lookup[scientific_name[0:3]][scientific_name]
                taxon_object_name = "{}_taxon".format(str(taxon_id))

        # retrieve Taxon object
        taxon_info = {}
        if (taxon_reference is None):
            taxon_info = self.dfu.get_objects({
                'object_refs': [taxon_wsname + "/" + taxon_object_name],
                'ignore_errors':
                0
            })['data'][0]
            taxon_reference = "{}/{}/{}".format(taxon_info['info'][6],
                                                taxon_info['info'][0],
                                                taxon_info['info'][4])
        else:
            taxon_info = self.dfu.get_objects({
                "object_refs": [taxon_reference],
                'ignore_errors': 0
            })['data'][0]

        taxonomy = taxon_info['data']['scientific_lineage']

        return taxonomy, taxon_reference
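
    # Editor's illustrative sketch (hypothetical lookup content): the taxon_lookup
    # object is keyed by the first three letters of the scientific name, e.g.
    #   taxon_lookup['Ara']['Arabidopsis thaliana'] == 3702
    # which resolves to the Taxon object 'ReferenceTaxons/3702_taxon' and a
    # taxon_reference of the form 'wsid/objid/version'.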

    def _retrieve_fasta_file(self, input_fasta_file, core_genome_name,
                             scientific_name, source):
        """
        _retrieve_fasta_file: retrieve info from fasta_file
                              https://www.biostars.org/p/710/

        """
        log("Reading FASTA file")

        assembly = {
            "contigs": {},
            "dna_size": 0,
            "gc_content": 0,
            "md5": [],
            "base_counts": {}
        }
        contig_seq_start = 0

        input_file_handle = open(input_fasta_file, 'rb')

        # alternate header and sequence
        faiter = (x[1] for x in itertools.groupby(input_file_handle,
                                                  lambda line: line[0] == ">"))
        for header in faiter:
            # drop the ">"
            header = header.next()[1:].strip()
            # join all sequence lines to one.
            seq = "".join(s.strip() for s in faiter.next())

            try:
                fasta_header, fasta_description = header.split(' ', 1)
            except ValueError:
                fasta_header = header
                fasta_description = None

            # Handle record
            seq = seq.upper()

            # Build contig objects for Assembly
            seq_count = dict(collections.Counter(seq))

            # to delete at end, but required for now
            contig_dict = {"sequence": seq}

            Ncount = 0
            if "N" in seq_count:
                Ncount = seq_count["N"]
            contig_dict["Ncount"] = Ncount

            for character in seq_count:
                if character in assembly["base_counts"]:
                    assembly["base_counts"][character] += seq_count[character]
                else:
                    assembly["base_counts"][character] = seq_count[character]

            contig_seq_length = len(seq)
            assembly["dna_size"] += contig_seq_length
            contig_gc_length = seq.count("G")
            contig_gc_length += seq.count("C")
            contig_dict["gc_content"] = float("{0:.2f}".format(
                float(contig_gc_length) / float(contig_seq_length)))
            assembly["gc_content"] += contig_gc_length
            contig_dict["contig_id"] = fasta_header
            contig_dict["name"] = fasta_header
            contig_dict["length"] = contig_seq_length
            contig_dict["md5"] = hashlib.md5(seq).hexdigest()
            assembly["md5"].append(contig_dict["md5"])

            if fasta_description is not None:
                contig_dict["description"] = fasta_description

            contig_dict["is_circular"] = "Unknown"
            contig_dict["start_position"] = contig_seq_start
            contig_dict["num_bytes"] = sys.getsizeof(contig_dict["sequence"])
            assembly["contigs"][fasta_header] = contig_dict

            # used for start of next sequence and total gc_content
            contig_seq_start += contig_seq_length

        assembly["gc_content"] = float("{0:.2f}".format(
            float(assembly["gc_content"]) / float(contig_seq_start)))
        assembly["md5"] = hashlib.md5(",".join(assembly["md5"])).hexdigest()
        assembly["assembly_id"] = core_genome_name + "_assembly"
        assembly["name"] = scientific_name
        assembly["external_source"] = source
        assembly["external_source_id"] = os.path.basename(input_fasta_file)
        assembly["external_source_origination_date"] = str(
            os.stat(input_fasta_file).st_ctime)
        assembly["num_contigs"] = len(assembly["contigs"].keys())
        assembly["type"] = "Unknown"
        assembly["notes"] = "Note MD5s are generated from uppercasing the sequences"

        return assembly
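
    # Editor's illustrative sketch (hypothetical 8 bp contig "ctg1" = ACGTACGN):
    # the assembly dict built above would contain, among other fields,
    #   assembly["dna_size"] == 8
    #   assembly["base_counts"] == {"A": 2, "C": 2, "G": 2, "T": 1, "N": 1}
    #   assembly["contigs"]["ctg1"]["gc_content"] == 0.5
    #   assembly["num_contigs"] == 1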

    def _retrieve_gff_file(self, input_gff_file):
        """
        _retrieve_gff_file: retrieve info from gff_file
    
        """
        log("Reading GFF file")

        feature_list = dict()
        is_phytozome = 0
        is_patric = 0

        gff_file_handle = open(input_gff_file, 'rb')
        current_line = gff_file_handle.readline()
        line_count = 0

        while (current_line != ''):
            current_line = current_line.strip()

            if (current_line.isspace() or current_line == ""
                    or current_line.startswith("#")):
                pass
            else:
                #Split line
                (contig_id, source_id, feature_type, start, end, score, strand,
                 phase, attributes) = current_line.split('\t')

                #Checking to see if Phytozome
                if ("phytozome" in source_id or "Phytozome" in source_id):
                    is_phytozome = 1

                #Checking to see if PATRIC
                if ("PATRIC" in source_id):
                    is_patric = 1

                #PATRIC prepends their contig ids with some gibberish
                if (is_patric and "|" in contig_id):
                    contig_id = contig_id.split("|", 1)[1]

                #Features grouped by contigs first
                if (contig_id not in feature_list):
                    feature_list[contig_id] = list()

                #Populating basic feature object
                ftr = {
                    'contig': contig_id,
                    'source': source_id,
                    'type': feature_type,
                    'start': int(start),
                    'end': int(end),
                    'score': score,
                    'strand': strand,
                    'phase': phase,
                    'attributes': attributes
                }

                #Populating with attribute key-value pair
                #This is where the feature id is from
                for attribute in attributes.split(";"):
                    attribute = attribute.strip()

                    #Sometimes empty string
                    if (attribute == ""):
                        continue

                    #Use maxsplit=1 since '=' may also appear within the value
                    #Sometimes lack of "=", assume spaces instead
                    if ("=" in attribute):
                        key, value = attribute.split("=", 1)
                    elif (" " in attribute):
                        key, value = attribute.split(" ", 1)
                    else:
                        log("Warning: attribute " + attribute +
                            " cannot be separated into key,value pair")
                        continue

                    ftr[key] = value

                feature_list[contig_id].append(ftr)

            current_line = gff_file_handle.readline()

        gff_file_handle.close()

        #Some GFF/GTF files don't use "ID" so we go through the possibilities
        feature_list = self._add_missing_identifiers(feature_list)

        #Most bacterial files have only CDSs
        #In order to work with prokaryotic and eukaryotic gene structure synonymously
        #Here we add feature dictionaries representing the parent gene and mRNAs
        feature_list = self._add_missing_parents(feature_list)

        #Phytozome has the annoying habit of editing their identifiers so we fix them
        if (is_phytozome):
            self._update_phytozome_features(feature_list)

        #All identifiers need to be checked so that they follow the same general rules
        #Rules are listed within the function itself
        feature_list = self._update_identifiers(feature_list)

        #If phytozome, the edited files need to be re-printed as GFF so that it works better with RNA-Seq pipeline
        if (is_phytozome):
            self._print_phytozome_gff(input_gff_file, feature_list)

        return feature_list
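
    # Editor's illustrative sketch (hypothetical GFF line, tab-separated):
    #   ctg1  maker  gene  100  500  .  +  .  ID=gene1;Name=G1
    # is parsed above into the feature dict
    #   {'contig': 'ctg1', 'source': 'maker', 'type': 'gene',
    #    'start': 100, 'end': 500, 'score': '.', 'strand': '+', 'phase': '.',
    #    'attributes': 'ID=gene1;Name=G1', 'ID': 'gene1', 'Name': 'G1'}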

    def _add_missing_identifiers(self, feature_list):

        #General rule is to iterate through a range of possibilities if "ID" is missing
        for contig in feature_list.keys():
            for i in range(len(feature_list[contig])):
                if ("ID" not in feature_list[contig][i]):
                    for key in ("transcriptId", "proteinId", "PACid", "pacid",
                                "Parent"):
                        if (key in feature_list[contig][i]):
                            feature_list[contig][i]['ID'] = feature_list[
                                contig][i][key]
                            break

                    #If the lookup fails, log an error
                    for ftr_type in ("gene", "mRNA", "CDS"):
                        if (ftr_type not in feature_list[contig][i]):
                            continue

                        if ("ID" not in feature_list[contig][i]):
                            log("Error: Cannot find unique ID to utilize in GFF attributes: "+ \
                                    feature_list[contig][i]['contig']+"."+ \
                                    feature_list[contig][i]['source']+"."+ \
                                    feature_list[contig][i]['type']+": "+ \
                                    feature_list[contig][i]['attributes'])
        return feature_list

    def _generate_feature_hierarchy(self, feature_list):

        feature_hierarchy = {contig: {} for contig in feature_list}

        #Need to remember mRNA/gene links for CDSs
        mRNA_gene_dict = {}
        exon_list_position_dict = {}

        for contig in feature_list:
            for i in range(len(feature_list[contig])):
                ftr = feature_list[contig][i]

                if ("gene" in ftr["type"]):
                    feature_hierarchy[contig][ftr["ID"]] = {
                        "utrs": [],
                        "mrnas": [],
                        "cdss": [],
                        "index": i
                    }

                if ("UTR" in ftr["type"]):
                    feature_hierarchy[contig][mRNA_gene_dict[
                        ftr["Parent"]]]["utrs"].append({
                            "id": ftr["ID"],
                            "index": i
                        })

                if ("RNA" in ftr["type"]):
                    feature_hierarchy[contig][ftr["Parent"]]["mrnas"].append({
                        "id":
                        ftr["ID"],
                        "index":
                        i,
                        "cdss": []
                    })
                    mRNA_gene_dict[ftr["ID"]] = ftr["Parent"]
                    exon_list_position_dict[ftr["ID"]] = len(
                        feature_hierarchy[contig][ftr["Parent"]]["mrnas"]) - 1

                if ("CDS" in ftr["type"]):
                    feature_hierarchy[contig][mRNA_gene_dict[ftr["Parent"]]]["mrnas"]\
                        [exon_list_position_dict[ftr["Parent"]]]["cdss"].append( { "id": ftr["ID"], "index" : i } )

        return feature_hierarchy
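
    # Editor's illustrative sketch (hypothetical gene1 -> mRNA1 -> two CDS rows,
    # at feature indices 0-3): the hierarchy built above would look like
    #   {'ctg1': {'gene1': {'utrs': [],
    #                       'mrnas': [{'id': 'mRNA1', 'index': 1,
    #                                  'cdss': [{'id': 'mRNA1.CDS.1', 'index': 2},
    #                                           {'id': 'mRNA1.CDS.2', 'index': 3}]}],
    #                       'cdss': [],
    #                       'index': 0}}}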

    def _add_missing_parents(self, feature_list):

        #General rules is if CDS or RNA missing parent, add them
        for contig in feature_list.keys():
            ftrs = feature_list[contig]
            new_ftrs = []
            for i in range(len(ftrs)):
                if ("Parent" not in ftrs[i]):
                    #Assuming parent doesn't exist at all, so create de novo instead of trying to find it
                    if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]):
                        new_gene_ftr = copy.deepcopy(ftrs[i])
                        new_gene_ftr["type"] = "gene"
                        ftrs[i]["Parent"] = new_gene_ftr["ID"]
                        new_ftrs.append(new_gene_ftr)

                    if ("CDS" in ftrs[i]["type"]):
                        new_rna_ftr = copy.deepcopy(ftrs[i])
                        new_rna_ftr["type"] = "mRNA"
                        new_ftrs.append(new_rna_ftr)
                        ftrs[i]["Parent"] = new_rna_ftr["ID"]

                new_ftrs.append(ftrs[i])
            feature_list[contig] = new_ftrs
        return feature_list

    def _update_phytozome_features(self, feature_list):

        #General rule is to use the "Name" field where possible
        #And update parent attribute correspondingly
        for contig in feature_list.keys():
            feature_position_dict = {}
            for i in range(len(feature_list[contig])):

                #Maintain old_id for reference
                #Sometimes ID isn't available, so use PACid
                old_id = None
                for key in ("ID", "PACid", "pacid"):
                    if (key in feature_list[contig][i]):
                        old_id = feature_list[contig][i][key]
                        break
                if (old_id is None):
                    #This should be an error
                    print("Cannot find unique ID, PACid, or pacid in GFF attributes: " +
                          feature_list[contig][i]["contig"] + "." +
                          feature_list[contig][i]["source"] + "." +
                          feature_list[contig][i]["attributes"])
                    continue

                #Retain old_id
                feature_position_dict[old_id] = i

                #In Phytozome, gene and mRNA have "Name" field, CDS do not
                if ("Name" in feature_list[contig][i]):
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        "Name"]

                if ("Parent" in feature_list[contig][i]):
                    #Update Parent to match new ID of parent ftr
                    feature_list[contig][i]["Parent"] = feature_list[contig][
                        feature_position_dict[feature_list[contig][i]
                                              ["Parent"]]]["ID"]

        return feature_list

    def _update_identifiers(self, feature_list):

        #General rules:
        #1) Genes keep identifier
        #2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA"
        #3) CDS always uses RNA identifier with ".CDS" appended
        #4) CDS appended with an incremented digit

        CDS_count_dict = dict()
        mRNA_parent_dict = dict()

        for contig in feature_list.keys():
            for ftr in feature_list[contig]:
                if ("Parent" in ftr):

                    #Retain old_id of parents
                    old_id = ftr["ID"]

                    if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]):
                        ftr["ID"] = ftr["Parent"] + "." + ftr["type"]

                    #link old to new ids for mRNA to use with CDS
                    if ("RNA" in ftr["type"]):
                        mRNA_parent_dict[old_id] = ftr["ID"]

                    if ("CDS" in ftr["type"]):
                        #Increment CDS identifier
                        if (ftr["ID"] not in CDS_count_dict):
                            CDS_count_dict[ftr["ID"]] = 1
                        else:
                            CDS_count_dict[ftr["ID"]] += 1
                        ftr["ID"] = ftr["ID"] + "." + str(
                            CDS_count_dict[ftr["ID"]])

                        #Recall new mRNA id for parent
                        ftr["Parent"] = mRNA_parent_dict[ftr["Parent"]]

        return feature_list
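
    # Editor's illustrative sketch (hypothetical features): for a gene "G1" whose
    # mRNA carries the same ID as its Parent ("G1") and has two CDS rows, the
    # rules above yield
    #   gene ID : G1
    #   mRNA ID : G1.mRNA                 (ID == Parent, so ".mRNA" is appended)
    #   CDS IDs : G1.CDS.1, G1.CDS.2      (old mRNA ID + ".CDS" + counter)
    # and each CDS Parent is rewritten to the new mRNA ID, G1.mRNA.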

    def _print_phytozome_gff(self, input_gff_file, feature_list):

        #Write modified feature ids to new file
        input_gff_file = input_gff_file.replace("gene", "edited_gene") + ".gz"
        try:
            print "Printing to new file: " + input_gff_file
            gff_file_handle = gzip.open(input_gff_file, 'wb')
        except:
            print "Failed to open"

        for contig in sorted(feature_list.iterkeys()):
            for ftr in feature_list[contig]:

                #Re-build attributes
                attributes_dict = {}
                for attribute in ftr["attributes"].split(";"):
                    attribute = attribute.strip()

                    #Sometimes empty string
                    if (attribute == ""):
                        continue

                    #Use maxsplit=1 since '=' may also appear within the value
                    #Sometimes lack of "=", assume spaces instead
                    if ("=" in attribute):
                        key, value = attribute.split("=", 1)
                    elif (" " in attribute):
                        key, value = attribute.split(" ", 1)
                    else:
                        log("Warning: attribute " + attribute +
                            " cannot be separated into key,value pair")
                        continue

                    if (ftr[key] != value):
                        value = ftr[key]
                    attributes_dict[key] = value

                ftr["attributes"] = ";".join(key + "=" + attributes_dict[key]
                                             for key in attributes_dict.keys())

                new_line = "\t".join(
                    str(ftr[key]) for key in [
                        'contig', 'source', 'type', 'start', 'end', 'score',
                        'strand', 'phase', 'attributes'
                    ])
                gff_file_handle.write(new_line + "\n")
        gff_file_handle.close()
        return

    def _retrieve_genome_feature_list(self, feature_list, feature_hierarchy,
                                      assembly):

        genome_features_list = list()
        genome_mrnas_list = list()
        genome_cdss_list = list()
        genome_translation_issues = list()

        for contig in feature_hierarchy:
            for gene in feature_hierarchy[contig]:

                #We only iterate through the gene objects
                #And then for each gene object, retrieve the necessary mRNA and CDS objects indirectly

                ftr = feature_list[contig][feature_hierarchy[contig][gene]
                                           ["index"]]
                contig_sequence = assembly["contigs"][
                    ftr["contig"]]["sequence"]
                gene_ftr = self._convert_ftr_object(
                    ftr, contig_sequence
                )  #reverse-complementation for negative strands done here

                #Add non-optional terms
                gene_ftr["mrnas"] = list()
                gene_ftr["cdss"] = list()
                gene_ftr["ontology_terms"] = dict()

                #Retaining longest sequences for gene feature
                longest_protein_length = 0
                longest_protein_sequence = ""
                for mRNA in feature_hierarchy[contig][gene]["mrnas"]:

                    ########################################################
                    # Construct mRNA Ftr
                    ########################################################
                    ftr = feature_list[contig][mRNA["index"]]
                    contig_sequence = assembly["contigs"][
                        ftr["contig"]]["sequence"]
                    mRNA_ftr = self._convert_ftr_object(
                        ftr, contig_sequence
                    )  #reverse-complementation for negative strands done here

                    #Modify mrna object for use in mrna array
                    #Objects will be un-used until further notice
                    mRNA_ftr['parent_gene'] = gene_ftr['id']

                    #If there are CDSs, point at the aggregated CDS ID (no incremented suffix)
                    if (len(mRNA['cdss']) > 0):
                        mRNA_ftr['cds'] = mRNA_ftr['id'] + ".CDS"
                    else:
                        mRNA_ftr['cds'] = ""

                    #Add to mrnas array
                    genome_mrnas_list.append(mRNA_ftr)

                    #Add ids to gene_ftr arrays
                    gene_ftr["mrnas"].append(mRNA_ftr["id"])

                    ########################################################
                    # Construct transcript, protein sequence, UTR, CDS locations
                    ########################################################

                    #At time of writing, all of this aggregation should probably be done in a single function
                    cds_exons_locations_array = list()
                    cds_cdna_sequence = str()
                    protein_sequence = str()
                    if (len(mRNA["cdss"]) > 0):
                        (cds_exons_locations_array, cds_cdna_sequence, protein_sequence) = \
                            self._cds_aggregation_translation(mRNA["cdss"],feature_list[contig],assembly,genome_translation_issues)

                    UTRs = list()
                    if ("utrs" in feature_hierarchy[contig][gene] and
                            len(feature_hierarchy[contig][gene]["utrs"]) > 0):
                        for UTR in feature_hierarchy[contig][gene]["utrs"]:
                            ftr = feature_list[contig][UTR["index"]]
                            if ("Parent" in ftr
                                    and ftr["Parent"] == mRNA_ftr["id"]):
                                UTRs.append(ftr)

                    mrna_exons_locations_array = copy.deepcopy(
                        cds_exons_locations_array)
                    mrna_transcript_sequence = str(cds_cdna_sequence)
                    if (len(UTRs) > 0):
                        (mrna_exons_locations_array, mrna_transcript_sequence) = \
                            self._utr_aggregation(UTRs,assembly,mrna_exons_locations_array,cds_cdna_sequence)

                    #Update sequence and locations
                    mRNA_ftr["dna_sequence"] = mrna_transcript_sequence
                    mRNA_ftr["dna_sequence_length"] = len(
                        mrna_transcript_sequence)
                    mRNA_ftr["location"] = mrna_exons_locations_array
                    mRNA_ftr["md5"] = hashlib.md5(
                        mRNA_ftr["dna_sequence"]).hexdigest()

                    #Remove DNA
                    del mRNA_ftr["dna_sequence"]
                    del mRNA_ftr["dna_sequence_length"]

                    #Skip CDS if not present
                    if (len(mRNA["cdss"]) == 0):
                        continue

                    #Remove asterisk representing stop codon if present
                    if (len(protein_sequence) > 0
                            and protein_sequence[-1] == '*'):
                        protein_sequence = protein_sequence[:-1]

                    #Save longest sequence
                    if (len(protein_sequence) > longest_protein_length):
                        longest_protein_length = len(protein_sequence)
                        longest_protein_sequence = protein_sequence

                    ########################################################
                    # Construct CDS Ftr
                    ########################################################
                    CDS_ftr = dict()
                    CDS_ftr['type'] = 'CDS'

                    #New CDS ID without incrementation as they were aggregated
                    CDS_ftr['id'] = mRNA_ftr['id'] + '.CDS'

                    #Add gene/mrna links
                    CDS_ftr['parent_gene'] = gene_ftr['id']
                    CDS_ftr['parent_mrna'] = mRNA_ftr['id']

                    #Update sequence and locations
                    CDS_ftr["dna_sequence"] = cds_cdna_sequence
                    CDS_ftr["dna_sequence_length"] = len(cds_cdna_sequence)
                    CDS_ftr["location"] = cds_exons_locations_array
                    CDS_ftr["md5"] = hashlib.md5(
                        CDS_ftr["dna_sequence"]).hexdigest()

                    #Add protein
                    CDS_ftr["protein_translation"] = str(
                        protein_sequence).upper()
                    CDS_ftr["protein_translation_length"] = len(
                        CDS_ftr["protein_translation"])
                    #Only generate md5 for dna sequences
                    #CDS_ftr["md5"] = hashlib.md5(CDS_ftr["protein_translation"]).hexdigest()

                    #Add empty non-optional fields for populating in future
                    CDS_ftr["ontology_terms"] = dict()
                    if ("aliases" not in CDS_ftr):
                        CDS_ftr["aliases"] = list()
                    if ("function" not in CDS_ftr):
                        CDS_ftr["function"] = ""

                    #Add to cdss array
                    genome_cdss_list.append(CDS_ftr)

                    #Add ids to gene_ftr arrays
                    gene_ftr["cdss"].append(CDS_ftr["id"])

                gene_ftr["protein_translation"] = longest_protein_sequence
                gene_ftr["protein_translation_length"] = longest_protein_length
                genome_features_list.append(gene_ftr)

        msg = "Genome features processed: {} genes, {} RNAs, and {} CDSs\n".format(
            len(genome_features_list), len(genome_mrnas_list),
            len(genome_cdss_list))
        msg += "{} mRNA(s) had errors during translation".format(
            len(genome_translation_issues))
        log(msg)

        return genome_features_list, genome_mrnas_list, genome_cdss_list

    def _gen_genome_info(self, core_genome_name, scientific_name, assembly_ref,
                         genome_features_list, genome_cdss_list,
                         genome_mrnas_list, source, assembly, taxon_reference,
                         taxonomy, input_gff_file):
        """
        _gen_genome_info: generate genome info

        """
        genome = dict()
        genome["id"] = core_genome_name
        genome["scientific_name"] = scientific_name
        genome["assembly_ref"] = assembly_ref
        genome["features"] = genome_features_list
        genome["cdss"] = genome_cdss_list
        genome["mrnas"] = genome_mrnas_list
        genome["source"] = source
        genome["domain"] = "Eukaryota"
        genome["genetic_code"] = 1
        genome["gc_content"] = assembly["gc_content"]
        genome["dna_size"] = assembly["dna_size"]

        if taxon_reference is not None:
            genome["taxon_ref"] = taxon_reference
            genome["taxonomy"] = taxonomy

        gff_file_to_shock = self.dfu.file_to_shock({
            'file_path': input_gff_file,
            'make_handle': 1,
            'pack': "gzip"
        })
        gff_handle_ref = gff_file_to_shock['handle']['hid']

        genome['gff_handle_ref'] = gff_handle_ref

        return genome

    def _convert_ftr_object(self, old_ftr, contig):
        new_ftr = dict()
        new_ftr["id"] = old_ftr["ID"]

        dna_sequence = Seq(contig[old_ftr["start"] - 1:old_ftr["end"]],
                           IUPAC.ambiguous_dna)

        # reverse complement
        if (old_ftr["strand"] == "-"):
            dna_sequence = dna_sequence.reverse_complement()
            old_start = old_ftr["start"]
            old_ftr["start"] = old_ftr["end"]
            old_ftr["end"] = old_start

        new_ftr["dna_sequence"] = str(dna_sequence).upper()
        new_ftr["dna_sequence_length"] = len(dna_sequence)
        new_ftr["md5"] = hashlib.md5(str(dna_sequence)).hexdigest()
        new_ftr["location"] = [[
            old_ftr["contig"], old_ftr["start"], old_ftr["strand"],
            len(dna_sequence)
        ]]
        new_ftr["type"] = old_ftr["type"]

        new_ftr["aliases"] = list()
        for key in ("transcriptId", "proteinId", "PACid", "pacid"):
            if (key in old_ftr.keys()):
                new_ftr["aliases"].append(key + ":" + old_ftr[key])

        return new_ftr
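
    # --- Hypothetical illustration, not part of the original converter ---
    # A minimal sketch of the conventions used by _convert_ftr_object above:
    # minus-strand features are reverse-complemented with start/end swapped,
    # and each location is recorded as [contig_id, start, strand, length].
    # The contig string and coordinates below are made up.
    @staticmethod
    def _example_minus_strand_feature():
        import hashlib
        from Bio.Seq import Seq

        contig = "AAACGTTGCAAA"
        start, end, strand = 4, 9, "-"       # 1-based, inclusive coordinates
        dna = Seq(contig[start - 1:end])
        if strand == "-":
            dna = dna.reverse_complement()
            start, end = end, start          # "start" becomes the 5' end

        return {
            "dna_sequence": str(dna).upper(),
            "dna_sequence_length": len(dna),
            "md5": hashlib.md5(str(dna).upper().encode()).hexdigest(),
            "location": [["hypothetical_contig", start, strand, len(dna)]],
        }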

    def _utr_aggregation(self, utr_list, assembly, exons, exon_sequence):

        #create copies of locations and transcript
        utrs_exons = list(exons)
        utr_exon_sequence = exon_sequence

        five_prime_dna_sequence = ""
        three_prime_dna_sequence = ""
        five_prime_locations = list()
        three_prime_locations = list()

        for UTR in (utr_list):
            contig_sequence = assembly["contigs"][UTR["contig"]]["sequence"]
            UTR_ftr = self._convert_ftr_object(
                UTR, contig_sequence
            )  #reverse-complementation for negative strands done here

            #aggregate sequences and locations
            if ("five_prime" in UTR_ftr["id"]):
                five_prime_dna_sequence += UTR_ftr["dna_sequence"]
                five_prime_locations.append(UTR_ftr["location"][0])
            if ("three_prime" in UTR_ftr["id"]):
                three_prime_dna_sequence += UTR_ftr["dna_sequence"]
                three_prime_locations.append(UTR_ftr["location"][0])

        #Handle five_prime UTRs
        if (len(five_prime_locations) > 0):

            #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file)
            five_prime_locations = sorted(five_prime_locations,
                                          key=lambda x: x[1])

            #Merge last UTR with CDS if "next" to each other
            if( ( utrs_exons[0][2] == "+" and five_prime_locations[-1][1]+five_prime_locations[-1][3] == utrs_exons[0][1] ) or \
                ( utrs_exons[0][2] == "-" and five_prime_locations[-1][1]-five_prime_locations[-1][3] == utrs_exons[0][1] ) ):

                #Remove last UTR
                last_five_prime_location = five_prime_locations[-1]
                five_prime_locations = five_prime_locations[:-1]

                #"Add" last UTR to first exon
                utrs_exons[0][1] = last_five_prime_location[1]
                utrs_exons[0][3] += last_five_prime_location[3]

            #Prepend other UTRs if available
            if (len(five_prime_locations) > 0):
                utrs_exons = five_prime_locations + utrs_exons

        utr_exon_sequence = five_prime_dna_sequence + utr_exon_sequence

        #Handle three_prime UTRs
        if (len(three_prime_locations) > 0):

            #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file)
            three_prime_locations = sorted(three_prime_locations,
                                           key=lambda x: x[1])

            #Merge first UTR with CDS if "next" to each other
            if( ( utrs_exons[0][2] == "+" and utrs_exons[-1][1]+utrs_exons[-1][3] == three_prime_locations[0][1] ) or \
                ( utrs_exons[0][2] == "-" and utrs_exons[-1][1]-utrs_exons[-1][3] == three_prime_locations[0][1] ) ):

                #Remove first UTR
                first_three_prime_location = three_prime_locations[0]
                three_prime_locations = three_prime_locations[1:]

                #"Add" first UTR to last exon
                utrs_exons[-1][3] += first_three_prime_location[3]

        #Append other UTRs if available
        if (len(three_prime_locations) > 0):
            utrs_exons = utrs_exons + three_prime_locations

        utr_exon_sequence += three_prime_dna_sequence

        return (utrs_exons, utr_exon_sequence)
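
    # --- Hypothetical illustration, not part of the original aggregation code ---
    # A minimal sketch of the adjacency test used above when a 5' UTR is merged
    # into the first exon: on the plus strand the UTR is adjacent when its start
    # plus its length equals the exon start. Locations follow the
    # [contig, start, strand, length] convention; the values are made up.
    @staticmethod
    def _example_merge_adjacent_five_prime_utr():
        utr = ["hypothetical_contig", 100, "+", 50]         # covers 100..149
        first_exon = ["hypothetical_contig", 150, "+", 200]

        if utr[2] == "+" and utr[1] + utr[3] == first_exon[1]:
            # absorb the UTR: the exon now starts at the UTR start and grows by its length
            first_exon = [utr[0], utr[1], "+", utr[3] + first_exon[3]]

        return first_exon    # ["hypothetical_contig", 100, "+", 250]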

    def _cds_aggregation_translation(self, cds_list, feature_list, assembly,
                                     issues):

        dna_sequence = ""
        locations = list()

        # collect phases, and lengths of exons
        # right now, this is only for the purpose of error reporting
        phases = list()
        exons = list()

        #Saving parent mRNA identifier
        Parent_mRNA = cds_list[0]["id"]
        for CDS in (cds_list):
            ftr = feature_list[CDS["index"]]
            phases.append(ftr["phase"])
            Parent_mRNA = ftr["Parent"]

            contig_sequence = assembly["contigs"][ftr["contig"]]["sequence"]
            CDS_ftr = self._convert_ftr_object(
                ftr, contig_sequence
            )  #reverse-complementation for negative strands done here
            exons.append(len(CDS_ftr["dna_sequence"]))

            # Remove base(s) according to phase, but only for first CDS
            if (CDS == cds_list[0] and int(ftr["phase"]) != 0):
                log("Adjusting phase for first CDS: " + CDS["id"])
                CDS_ftr["dna_sequence"] = CDS_ftr["dna_sequence"][
                    int(ftr["phase"]):]

            #aggregate sequences and locations
            dna_sequence += CDS_ftr["dna_sequence"]
            locations.append(CDS_ftr["location"][0])

        # translate sequence
        dna_sequence_obj = Seq(dna_sequence, IUPAC.ambiguous_dna)
        rna_sequence = dna_sequence_obj.transcribe()

        # incomplete gene model with no start codon
        if str(rna_sequence.upper())[:3] not in codon_table.start_codons:
            msg = "Missing start codon for {}. Possibly incomplete gene model.".format(
                Parent_mRNA)
            log(msg)

        # This should never happen; it needs to be reported rather than silently "fixed"
        codon_count = len(str(rna_sequence)) % 3
        if codon_count != 0:
            msg = "Number of bases for RNA sequence for {} ".format(
                Parent_mRNA)
            msg += "is not divisible by 3. "
            msg += "The resulting protein may well be mis-translated."
            log(msg)
            issues.append(Parent_mRNA)

        protein_sequence = Seq("")
        try:
            protein_sequence = rna_sequence.translate()
        except CodonTable.TranslationError as te:
            log("TranslationError for: " + Parent_mRNA + " (phases: " +
                str(phases) + ", exon lengths: " + str(exons) + "): " + str(te))

        return (locations, dna_sequence.upper(), str(protein_sequence).upper())
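
# The CDS aggregation above amounts to trimming the phase offset from the first
# CDS segment, concatenating the exon DNA in transcription order, translating the
# result, and stripping a trailing stop codon. Below is a minimal standalone
# sketch of that flow with Biopython; the segments and phase are made up, and the
# plain Seq constructor is used so no IUPAC alphabet import is needed.
import hashlib
from Bio.Seq import Seq


def example_translate_cds_segments():
    # Hypothetical CDS segments in transcription order; the first has phase 2,
    # so two bases are trimmed before translation starts.
    segments = [("GTATGGCT", 2), ("CATTGA", 0)]

    dna = ""
    for i, (segment, phase) in enumerate(segments):
        if i == 0 and phase != 0:
            segment = segment[phase:]
        dna += segment

    rna = Seq(dna).transcribe()
    protein = str(rna.translate()).upper()     # "MAH*" for the bases above
    if protein.endswith("*"):                  # drop the trailing stop codon
        protein = protein[:-1]

    return protein, dna.upper(), hashlib.md5(dna.encode()).hexdigest()
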
class ExpressionUtils:
    '''
    Module Name:
    ExpressionUtils

    Module Description:
    A KBase module: ExpressionUtils

This module is intended for use by Assemblers to upload RNASeq Expression files
(gtf, fpkm and ctab). This module generates the ctab files and tpm data if they are absent.
The expression files are uploaded as a single compressed file.This module also generates
expression levels and tpm expression levels from the input files and saves them in the
workspace object. Once uploaded, the expression files can be downloaded onto an output directory.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.1.1"
    GIT_URL = "https://github.com/JamesJeffryes/ExpressionUtils.git"
    GIT_COMMIT_HASH = "62ce653aa5c5b39a597486613bc140b173a35c99"

    #BEGIN_CLASS_HEADER

    PARAM_IN_SRC_DIR = 'source_dir'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_ALIGNMENT_REF = 'alignment_ref'

    PARAM_IN_GENOME_REF = 'genome_ref'
    PARAM_IN_ANNOTATION_ID = 'annotation_id'
    PARAM_IN_BAM_FILE_PATH = 'bam_file_path'
    PARAM_IN_DESCRIPTION = 'description'
    PARAM_IN_DATA_QUAL_LEVEL = 'data_quality_level'
    PARAM_IN_PROC_COMMENTS = 'processing_comments'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'
    PARAM_IN_ORIG_MEDIAN = 'original_median'
    PARAM_IN_EXT_SRC_DATE = 'external_source_date'
    PARAM_IN_TRANSCRIPTS = 'transcripts'
    PARAM_IN_SRC = 'source'

    def _check_required_param(self, in_params, param_list):
        """
        Check if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Check the validity of workspace and object params and return them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        dfu = DataFileUtil(self.callback_url)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id
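
    # --- Hypothetical illustration, not part of the original module ---
    # A minimal sketch of how the 'ws_name_or_id/obj_name_or_id' destination_ref
    # string is split by _proc_ws_obj_params above; os.path.split does the work.
    # The workspace and object names are made up.
    @staticmethod
    def _example_split_destination_ref():
        import os
        ws_name_id, obj_name_id = os.path.split('my_workspace/my_expression')
        return ws_name_id, obj_name_id    # ('my_workspace', 'my_expression')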

    def _proc_upload_expression_params(self, ctx, params):
        """
        Check the presence and validity of upload expression params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_SRC_DIR,
            self.PARAM_IN_ALIGNMENT_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        source_dir = params.get(self.PARAM_IN_SRC_DIR)

        if not (os.path.isdir(source_dir)):
            raise ValueError('Source directory does not exist: ' + source_dir)

        if not os.listdir(source_dir):
            raise ValueError('Source directory is empty: ' + source_dir)

        return ws_name_id, obj_name_id, source_dir

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _get_genome_ref(self, assembly_or_genome_ref, params):
        if self.PARAM_IN_GENOME_REF in params and params[
                self.PARAM_IN_GENOME_REF] is not None:
            return params[self.PARAM_IN_GENOME_REF]

        obj_type = self._get_ws_info(assembly_or_genome_ref)[2]
        if obj_type.startswith('KBaseGenomes.Genome'):
            return assembly_or_genome_ref

        raise ValueError('Alignment object does not contain genome_ref; '
                         '"{}" parameter is required'.format(
                             self.PARAM_IN_GENOME_REF))

    def _get_expression_levels(self,
                               source_dir,
                               genome_ref,
                               transcripts=False):

        fpkm_file_path = os.path.join(source_dir, 'genes.fpkm_tracking')
        if transcripts:
            fpkm_file_path = os.path.join(source_dir, 't_data.ctab')

        if not os.path.isfile(fpkm_file_path):
            raise ValueError('{} file is required'.format(fpkm_file_path))

        id_col = 5 if transcripts else 0
        self.__LOGGER.info(
            'Generating expression levels from {}'.format(fpkm_file_path))
        return self.expression_utils.get_expression_levels(
            fpkm_file_path, genome_ref, id_col)

    def _gen_ctab_files(self, params, alignment_ref):

        source_dir = params.get(self.PARAM_IN_SRC_DIR)
        if len(glob.glob(source_dir + '/*.ctab')) < 5:

            self.__LOGGER.info(' =======  Generating ctab files ==========')
            gtf_file = os.path.join(source_dir, 'transcripts.gtf')
            if not os.path.isfile(gtf_file):
                raise ValueError(
                    "{} file is required to generate ctab files, found missing"
                    .format(gtf_file))

            if self.PARAM_IN_BAM_FILE_PATH in params and \
               params[self.PARAM_IN_BAM_FILE_PATH] is not None:
                bam_file_path = params[self.PARAM_IN_BAM_FILE_PATH]
            else:
                self.__LOGGER.info(
                    'Downloading bam file from alignment object')
                rau = ReadsAlignmentUtils(self.callback_url)
                alignment_retVal = rau.download_alignment(
                    {'source_ref': alignment_ref})
                alignment_dir = alignment_retVal.get('destination_dir')

                allbamfiles = glob.glob(alignment_dir + '/*.bam')
                if len(allbamfiles) == 0:
                    raise ValueError(
                        'bam file does not exist in {}'.format(alignment_dir))
                elif len(allbamfiles) == 1:
                    bam_file_path = allbamfiles[0]
                elif len(allbamfiles) > 1:
                    tmp_file_path = os.path.join(alignment_dir,
                                                 'accepted_hits.bam')
                    if os.path.isfile(tmp_file_path):
                        bam_file_path = tmp_file_path
                    else:
                        tmp_file_path = os.path.join(
                            alignment_dir, 'accepted_hits_sorted.bam')
                        if os.path.isfile(tmp_file_path):
                            bam_file_path = tmp_file_path
                        else:
                            raise ValueError(
                                'accepted_hits.bam, accepted_hits_sorted.bam or other bam file not found in {}'
                                .format(alignment_dir))

            result = self.table_maker.build_ctab_files(
                ref_genome_path=gtf_file,
                alignment_path=bam_file_path,
                output_dir=source_dir)
            if result != 0:
                raise ValueError('Tablemaker failed')

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.__LOGGER = logging.getLogger('ExpressionUtils')
        self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s"
        )
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.config['SDK_CALLBACK_URL'] = self.callback_url
        self.expression_utils = Expression_Utils(self.config)
        self.dfu = DataFileUtil(self.callback_url)
        self.table_maker = TableMaker(config, self.__LOGGER)
        self.expr_matrix_utils = ExprMatrixUtils(config, self.__LOGGER)
        #END_CONSTRUCTOR
        pass

    def upload_expression(self, ctx, params):
        """
        Uploads the expression  *
        :param params: instance of type "UploadExpressionParams" (*   
           Required input parameters for uploading a reads expression data
           string   destination_ref        -   object reference of expression
           data. The object ref is 'ws_name_or_id/obj_name_or_id' where
           ws_name_or_id is the workspace name or id and obj_name_or_id is
           the object name or id string   source_dir             -  
           directory with the files to be uploaded string   alignment_ref    
           -   alignment workspace object reference *) -> structure:
           parameter "destination_ref" of String, parameter "source_dir" of
           String, parameter "alignment_ref" of String, parameter
           "genome_ref" of String, parameter "annotation_id" of String,
           parameter "bam_file_path" of String, parameter "transcripts" of
           type "boolean" (A boolean - 0 for false, 1 for true. @range (0,
           1)), parameter "data_quality_level" of Long, parameter
           "original_median" of Double, parameter "description" of String,
           parameter "platform" of String, parameter "source" of String,
           parameter "external_source_date" of String, parameter
           "processing_comments" of String
        :returns: instance of type "UploadExpressionOutput" (*     Output
           from upload expression    *) -> structure: parameter "obj_ref" of
           String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_expression

        self.__LOGGER.info('Starting upload expression, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, source_dir = self._proc_upload_expression_params(
            ctx, params)

        alignment_ref = params.get(self.PARAM_IN_ALIGNMENT_REF)
        try:
            alignment_obj = self.dfu.get_objects(
                {'object_refs': [alignment_ref]})['data'][0]
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        alignment = alignment_obj['data']
        assembly_or_genome_ref = alignment['genome_id']

        genome_ref = self._get_genome_ref(assembly_or_genome_ref, params)

        expression_levels, tpm_expression_levels = self._get_expression_levels(
            source_dir, genome_ref, params.get(self.PARAM_IN_TRANSCRIPTS))

        self._gen_ctab_files(params, alignment_ref)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': source_dir,
            'make_handle': 1,
            'pack': 'zip'
        })
        """
        move the zipfile created in the source directory one level up
        """
        path, dir = os.path.split(source_dir)
        zipfile = dir + '.zip'
        if os.path.isfile(os.path.join(source_dir, zipfile)):
            shutil.move(os.path.join(source_dir, zipfile),
                        os.path.join(path, zipfile))

        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        expression_data = {
            'numerical_interpretation': 'FPKM',
            'genome_id': genome_ref,
            'mapped_rnaseq_alignment': {
                alignment['read_sample_id']: alignment_ref
            },
            'condition': alignment['condition'],
            'file': file_handle,
            'expression_levels': expression_levels,
            'tpm_expression_levels': tpm_expression_levels
        }
        additional_params = [
            self.PARAM_IN_ANNOTATION_ID, self.PARAM_IN_DESCRIPTION,
            self.PARAM_IN_DATA_QUAL_LEVEL, self.PARAM_IN_PLATFORM,
            self.PARAM_IN_PROC_COMMENTS, self.PARAM_IN_MAPPED_SAMPLE_ID,
            self.PARAM_IN_ORIG_MEDIAN, self.PARAM_IN_EXT_SRC_DATE,
            self.PARAM_IN_SRC
        ]

        for opt_param in additional_params:
            if opt_param in params and params[opt_param] is not None:
                expression_data[opt_param] = params[opt_param]

        extra_provenance_input_refs = list()
        extra_provenance_input_refs.append(
            params.get(self.PARAM_IN_ALIGNMENT_REF))
        if self.PARAM_IN_GENOME_REF in params and params.get(
                self.PARAM_IN_GENOME_REF) is not None:
            extra_provenance_input_refs.append(
                params.get(self.PARAM_IN_GENOME_REF))

        self.__LOGGER.info('===========   Adding extra_provenance_refs')
        self.__LOGGER.info(str(extra_provenance_input_refs))
        self.__LOGGER.info('==========================================')

        res = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type":
                "KBaseRNASeq.RNASeqExpression",
                "data":
                expression_data,
                "name":
                obj_name_id,
                "extra_provenance_input_refs":
                extra_provenance_input_refs
            }]
        })[0]

        self.__LOGGER.info('save complete')

        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        print(returnVal)
        #END upload_expression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_expression return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_expression(self, ctx, params):
        """
        Downloads expression *
        :param params: instance of type "DownloadExpressionParams" (*
           Required input parameters for downloading expression string
           source_ref         -       object reference of expression source.
           The object ref is 'ws_name_or_id/obj_name_or_id' where
           ws_name_or_id is the workspace name or id and obj_name_or_id is
           the object name or id *) -> structure: parameter "source_ref" of
           String
        :returns: instance of type "DownloadExpressionOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_expression

        self.__LOGGER.info('Running download_expression with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required')

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'download_' + str(timestamp))
        os.mkdir(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id':
            expression[0]['data']['file']['id'],
            'file_path':
            output_dir,
            'unpack':
            'unpack'
        })

        if not os.listdir(output_dir):
            raise ValueError('No files were downloaded: ' + output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        returnVal = {'destination_dir': output_dir}

        #END download_expression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_expression return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_expression(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download expressions from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting expression string   source_ref         - 
           object reference of expression source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_expression

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required')

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        output = {'shock_id': expression[0]['data']['file']['id']}

        #END export_expression

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_expression return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def get_expressionMatrix(self, ctx, params):
        """
        :param params: instance of type "getExprMatrixParams" (* Following
           are the required input parameters to get Expression Matrix *) ->
           structure: parameter "workspace_name" of String, parameter
           "output_obj_name" of String, parameter "expressionset_ref" of
           String
        :returns: instance of type "getExprMatrixOutput" -> structure:
           parameter "exprMatrix_FPKM_ref" of String, parameter
           "exprMatrix_TPM_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_expressionMatrix
        fpkm_ref, tpm_ref = self.expr_matrix_utils.get_expression_matrix(
            params)

        returnVal = {
            'exprMatrix_FPKM_ref': fpkm_ref,
            'exprMatrix_TPM_ref': tpm_ref
        }
        #END get_expressionMatrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method get_expressionMatrix return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
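
A rough usage sketch for the ExpressionUtils class above. The config values, context object, workspace refs, and directories are placeholders, and a real call runs inside a KBase SDK job with SDK_CALLBACK_URL set in the environment:

config = {'scratch': '/kb/module/work/tmp',
          'workspace-url': 'https://kbase.us/services/ws'}    # placeholder values
ctx = {'token': 'placeholder-token'}

eu = ExpressionUtils(config)

upload_params = {
    'destination_ref': 'my_workspace/my_expression',          # placeholder ws/object names
    'source_dir': '/kb/module/work/tmp/stringtie_output',     # placeholder directory
    'alignment_ref': '123/4/5',                               # placeholder alignment ref
}
obj_ref = eu.upload_expression(ctx, upload_params)[0]['obj_ref']

download_dir = eu.download_expression(ctx, {'source_ref': obj_ref})[0]['destination_dir']
shock_id = eu.export_expression(ctx, {'source_ref': obj_ref})[0]['shock_id']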
Example #19
0
class ReadsAlignmentUtils:
    '''
    Module Name:
    ReadsAlignmentUtils

    Module Description:
    A KBase module: ReadsAlignmentUtils

This module is intended for use by Aligners and Assemblers to upload and download alignment files.
The alignment may be uploaded as a sam or bam file. If a sam file is given, it is converted to
the sorted bam format and saved. Upon downloading, optional parameters may be provided to get files
in sam and bai formats from the downloaded bam file. This utility also generates stats from the
stored alignment.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git"
    GIT_COMMIT_HASH = "a807d122b097a4c6713a81d5a82eef335835f77a"

    #BEGIN_CLASS_HEADER

    PARAM_IN_FILE = 'file_path'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_CONDITION = 'condition'
    PARAM_IN_READ_LIB_REF = 'read_library_ref'
    PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref'

    PARAM_IN_ALIGNED_USING = 'aligned_using'
    PARAM_IN_ALIGNER_VER = 'aligner_version'
    PARAM_IN_ALIGNER_OPTS = 'aligner_opts'
    PARAM_IN_REPLICATE_ID = 'replicate_id'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index'
    PARAM_IN_SAMPLESET_REF = 'sampleset_ref'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'

    PARAM_IN_DOWNLOAD_SAM = 'downloadSAM'
    PARAM_IN_DOWNLOAD_BAI = 'downloadBAI'
    PARAM_IN_VALIDATE = 'validate'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    def _get_file_path_info(self, file_path):
        """
        Given a file path, returns the directory, file name, file base and file extension
        """
        dir, file_name = os.path.split(file_path)
        file_base, file_ext = os.path.splitext(file_name)

        return dir, file_name, file_base, file_ext

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _check_required_param(self, in_params, param_list):
        """
        Checks if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Checks the validity of workspace and object params and returns them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _proc_upload_alignment_params(self, ctx, params):
        """
        Checks the presence and validity of upload alignment params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION,
            self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        file_path = params.get(self.PARAM_IN_FILE)

        if not (os.path.isfile(file_path)):
            raise ValueError('File does not exist: ' + file_path)

        lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2]
        if lib_type.startswith('KBaseFile.SingleEndLibrary') or \
           lib_type.startswith('KBaseFile.PairedEndLibrary') or \
           lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \
           lib_type.startswith('KBaseAssembly.PairedEndLibrary'):
            pass
        else:
            raise ValueError(self.PARAM_IN_READ_LIB_REF +
                             ' parameter should be of type' +
                             ' KBaseFile.SingleEndLibrary or' +
                             ' KBaseFile.PairedEndLibrary or' +
                             ' KBaseAssembly.SingleEndLibrary or' +
                             ' KBaseAssembly.PairedEndLibrary')

        obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2]
        if obj_type.startswith('KBaseGenomes.Genome') or \
           obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \
           obj_type.startswith('KBaseGenomes.ContigSet'):
            pass
        else:
            raise ValueError(self.PARAM_IN_ASM_GEN_REF +
                             ' parameter should be of type' +
                             ' KBaseGenomes.Genome or' +
                             ' KBaseGenomeAnnotations.Assembly or' +
                             ' KBaseGenomes.ContigSet')
        return ws_name_id, obj_name_id, file_path, lib_type

    def _get_aligner_stats(self, bam_file):
        """
        Gets the aligner stats from BAM file
        """
        path, file = os.path.split(bam_file)
        return self.samtools.get_stats(file, path)

    def _validate(self, params):
        samt = SamTools(self.config, self.__LOGGER)
        if 'ignore' in params:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file,
                                 ipath=path,
                                 ignore=params['ignore'])
        else:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file, ipath=path)

        return rval

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.__LOGGER = logging.getLogger('KBaseRNASeq')
        if 'log_level' in config:
            self.__LOGGER.setLevel(config['log_level'])
        else:
            self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - \
                                       %(levelname)s - %(message)s")
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        script_utils.check_sys_stat(self.__LOGGER)

        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.samtools = SamTools(config)
        #END_CONSTRUCTOR
        pass

    def validate_alignment(self, ctx, params):
        """
        :param params: instance of type "ValidateAlignmentParams" (* Input
           parameters for validating a reads alignment. For validation errors
           to ignore, see
           http://broadinstitute.github.io/picard/command-line-overview.html#V
           alidateSamFile) -> structure: parameter "file_path" of String,
           parameter "ignore" of list of String
        :returns: instance of type "ValidateAlignmentOutput" (* Results from
           validate alignment *) -> structure: parameter "validated" of type
           "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN validate_alignment

        rval = self._validate(params)

        if rval == 0:
            returnVal = {'validated': True}
        else:
            returnVal = {'validated': False}

        #END validate_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method validate_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def upload_alignment(self, ctx, params):
        """
        Validates and uploads the reads alignment  *
        :param params: instance of type "UploadAlignmentParams" (* Required
           input parameters for uploading a reads alignment string
           destination_ref -  object reference of alignment destination. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id file_path              -  File with the path of the sam or
           bam file to be uploaded. If a sam file is provided, it will be
           converted to the sorted bam format before being saved
           read_library_ref       -  workspace object ref of the read sample
           used to make the alignment file condition              -
           assembly_or_genome_ref -  workspace object ref of genome assembly
           or genome object that was used to build the alignment *) ->
           structure: parameter "destination_ref" of String, parameter
           "file_path" of String, parameter "read_library_ref" of String,
           parameter "condition" of String, parameter
           "assembly_or_genome_ref" of String, parameter "aligned_using" of
           String, parameter "aligner_version" of String, parameter
           "aligner_opts" of mapping from String to String, parameter
           "replicate_id" of String, parameter "platform" of String,
           parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter
           "sampleset_ref" of type "ws_Sampleset_ref", parameter
           "mapped_sample_id" of mapping from String to mapping from String
           to String, parameter "validate" of type "boolean" (A boolean - 0
           for false, 1 for true. @range (0, 1)), parameter "ignore" of list
           of String
        :returns: instance of type "UploadAlignmentOutput" (*  Output from
           uploading a reads alignment  *) -> structure: parameter "obj_ref"
           of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_alignment

        self.__LOGGER.info(
            'Starting upload Reads Alignment, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params(
            ctx, params)

        dir, file_name, file_base, file_ext = self._get_file_path_info(
            file_path)

        if self.PARAM_IN_VALIDATE in params and params[
                self.PARAM_IN_VALIDATE] is True:
            if self._validate(params) == 1:
                raise Exception('{0} failed validation'.format(file_path))

        bam_file = file_path
        if file_ext.lower() == '.sam':
            bam_file = os.path.join(dir, file_base + '.bam')
            self.samtools.convert_sam_to_sorted_bam(ifile=file_name,
                                                    ipath=dir,
                                                    ofile=bam_file)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': bam_file,
            'make_handle': 1
        })
        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        aligner_stats = self._get_aligner_stats(file_path)
        aligner_data = {
            'file': file_handle,
            'size': file_size,
            'condition': params.get(self.PARAM_IN_CONDITION),
            'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF),
            'library_type': lib_type,
            'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF),
            'alignment_stats': aligner_stats
        }
        optional_params = [
            self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER,
            self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID,
            self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX,
            self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID
        ]
        for opt_param in optional_params:
            if opt_param in params and params[opt_param] is not None:
                aligner_data[opt_param] = params[opt_param]

        self.__LOGGER.info('=========  Adding extra_provenance_refs')
        self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF))
        self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF))
        self.__LOGGER.info('=======================================')

        res = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type":
                "KBaseRNASeq.RNASeqAlignment",
                "data":
                aligner_data,
                "name":
                obj_name_id,
                "extra_provenance_input_refs": [
                    params.get(self.PARAM_IN_READ_LIB_REF),
                    params.get(self.PARAM_IN_ASM_GEN_REF)
                ]
            }]
        })[0]
        self.__LOGGER.info('save complete')

        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        self.__LOGGER.info(returnVal)

        #END upload_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_alignment(self, ctx, params):
        """
        Downloads alignment files in .bam, .sam and .bai formats. Also downloads alignment stats *
        :param params: instance of type "DownloadAlignmentParams" (* Required
           input parameters for downloading a reads alignment string
           source_ref -  object reference of alignment source. The object ref
           is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "DownloadAlignmentOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String, parameter "stats" of type
           "AlignmentStats" -> structure: parameter "properly_paired" of
           Long, parameter "multiple_alignments" of Long, parameter
           "singletons" of Long, parameter "alignment_rate" of Double,
           parameter "unmapped_reads" of Long, parameter "mapped_reads" of
           Long, parameter "total_reads" of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_alignment

        self.__LOGGER.info('Running download_alignment with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        try:
            alignment = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        uuid_str = str(uuid.uuid4())
        output_dir = os.path.join(self.scratch, 'download_' + uuid_str)
        self._mkdir_p(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id':
            alignment[0]['data']['file']['id'],
            'file_path':
            output_dir
        })
        if zipfile.is_zipfile(file_ret.get('file_path')):
            with zipfile.ZipFile(file_ret.get('file_path')) as z:
                z.extractall(output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        bam_files = glob.glob(output_dir + '/*.bam')

        uuid_prefix = uuid_str[:8]
        if len(bam_files) == 0:
            raise ValueError("Alignment object does not contain a bam file")

        for bam_file_path in bam_files:
            dir, file_name, file_base, file_ext = self._get_file_path_info(
                bam_file_path)
            if params.get(self.PARAM_IN_VALIDATE, False):
                validate_params = {'file_path': bam_file_path}
                if self._validate(validate_params) == 1:
                    raise Exception(
                        '{0} failed validation'.format(bam_file_path))

            if params.get('downloadBAI', False):
                bai_file = uuid_prefix + '_' + file_base + '.bai'
                bai_file_path = os.path.join(output_dir, bai_file)
                self.samtools.create_bai_from_bam(ifile=file_name,
                                                  ipath=output_dir,
                                                  ofile=bai_file)
                if not os.path.isfile(bai_file_path):
                    raise ValueError('Error creating {}'.format(bai_file_path))

            if params.get('downloadSAM', False):
                sam_file = uuid_prefix + '_' + file_base + '.sam'
                sam_file_path = os.path.join(output_dir, sam_file)
                self.samtools.convert_bam_to_sam(ifile=file_name,
                                                 ipath=output_dir,
                                                 ofile=sam_file)
                if not os.path.isfile(sam_file_path):
                    raise ValueError('Error creating {}'.format(sam_file_path))

        returnVal = {
            'destination_dir': output_dir,
            'stats': alignment[0]['data']['alignment_stats']
        }

        #END download_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_alignment(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download alignments from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting a reads alignment string source_ref - 
           object reference of alignment source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_alignment

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        if params.get(self.PARAM_IN_VALIDATE, False) or \
           params.get('exportBAI', False) or \
           params.get('exportSAM', False):
            """
            Need to validate or convert files. Use download_alignment
            """
            download_params = {}
            for key, val in params.iteritems():
                download_params[key.replace('export', 'download')] = val

            download_retVal = self.download_alignment(ctx, download_params)[0]

            export_dir = download_retVal['destination_dir']

            # package and load to shock
            ret = self.dfu.package_for_download({
                'file_path': export_dir,
                'ws_refs': [inref]
            })
            output = {'shock_id': ret['shock_id']}
        else:
            """
            return shock id from the object
            """
            try:
                alignment = self.dfu.get_objects({'object_refs':
                                                  [inref]})['data']
            except DFUError as e:
                self.__LOGGER.error(
                    'Logging stacktrace from workspace exception:\n' + e.data)
                raise
            output = {'shock_id': alignment[0]['data']['file']['id']}

        #END export_alignment

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_alignment return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
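
A rough usage sketch for the ReadsAlignmentUtils class above. As in the previous example, the config, context object, refs, and file paths are placeholders, and SDK_CALLBACK_URL must be set in the environment:

config = {'scratch': '/kb/module/work/tmp',
          'workspace-url': 'https://kbase.us/services/ws'}    # placeholder values
ctx = {'token': 'placeholder-token'}

rau = ReadsAlignmentUtils(config)

upload_params = {
    'destination_ref': 'my_workspace/my_alignment',           # placeholder ws/object names
    'file_path': '/kb/module/work/tmp/accepted_hits.sam',     # a sam file is converted to sorted bam
    'condition': 'wild_type',
    'read_library_ref': '123/6/1',                            # placeholder object refs
    'assembly_or_genome_ref': '123/7/1',
}
obj_ref = rau.upload_alignment(ctx, upload_params)[0]['obj_ref']

result = rau.download_alignment(ctx, {'source_ref': obj_ref,
                                      'downloadSAM': 1,
                                      'downloadBAI': 1})[0]
alignment_dir, stats = result['destination_dir'], result['stats']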
Example #20
0
class Utils:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.gen_api = GenericsAPI(self.callback_url)
        self.DEFAULT_ONTOLOGY_REF = "KbaseOntologies/Custom"
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def get_conditions(self, params):
        data = self.dfu.get_objects(
            {'object_refs': [params['condition_set_ref']]})['data'][0]['data']
        conditions = {}
        keep_keys = params.get('conditions', data['conditions'].keys())
        for key in keep_keys:
            conditions[key] = defaultdict(list)
            for factor, val in zip(data['factors'], data['conditions'][key]):
                ont_abriv = factor['factor_ont_id'].split(":")[0]
                factor['value'] = val
                conditions[key][ont_abriv].append(copy.copy(factor))
        return {"conditions": conditions}

    def file_to_condition_set(self, params):
        """Convert a user supplied file to a compound set"""
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either a input_shock_id or input_file_path")
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep="\t", dtype='str')
        comp_set = self._df_to_cs_obj(df)
        info = self.dfu.save_objects({
            "id":
            params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.ConditionSet",
                "data": comp_set,
                "name": params['output_obj_name']
            }]
        })[0]
        return {"condition_set_ref": "%s/%s/%s" % (info[6], info[0], info[4])}

    def _conditionset_data_to_df(self, data):
        """
        Converts a condition set object data to a dataframe
        """

        factors = pd.DataFrame(data['factors'])
        factors = factors.rename(columns=lambda x: x.replace(
            "ont", "ontology").capitalize().replace("_", " "))
        conditions = pd.DataFrame(data['conditions'])
        cs_df = factors.join(conditions)

        return cs_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """

        original_matrix_ref = data.get('original_data')
        data_matrix = self.gen_api.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [
            cluster.get('id_to_data_position').keys() for cluster in clusters
        ]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(
                id_names):  # cluster is based on condition
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        for cluster_id, cluster in enumerate(clusters):
            item_ids = cluster.get('id_to_data_position').keys()
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]

            for idx in item_idx:
                cluster_names[idx] = cluster_id

        data_df['cluster'] = cluster_names

        return data_df
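    # The dataframe returned above has one row per clustered item (transposed when the
    # clustering was done on conditions) plus a 'cluster' column holding the integer
    # cluster index, e.g. (labels and values are illustrative only):
    #
    #             cond_1  cond_2  cluster
    #   gene_1       1.2     0.4        0
    #   gene_2       0.9     1.1        1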

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a dataframe"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]

        obj_type = res['info'][2]

        if "KBaseExperiments.ConditionSet" in obj_type:
            cs_df = self._conditionset_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply a KBaseExperiments.ConditionSet or KBaseExperiments.ClusterSet'
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _df_to_cs_obj(self, cs_df):
        """Converts a dataframe from a user file to a compound set object"""
        condition_set = {'ontology_mapping_method': "User Curation"}
        cs_df.fillna('', inplace=True)
        if not len(cs_df):
            raise ValueError("No factors in supplied files")
        factor_df = cs_df.filter(regex="[Uu]nit|[Ff]actor")
        condition_df = cs_df.drop(factor_df.columns, axis=1)
        if not len(condition_df.columns):
            raise ValueError(
                "Unable to find any condition columns in supplied file")

        factor_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
            inplace=True)
        if "factor" not in factor_df.columns:
            raise ValueError(
                "Unable to find a 'Factor' column in supplied file")
        factor_fields = ('factor', 'unit', 'factor_ont_id', 'unit_ont_id')
        factors = factor_df.filter(items=factor_fields).to_dict('records')

        condition_set['factors'] = [
            self._add_ontology_info(f) for f in factors
        ]
        condition_set['conditions'] = condition_df.to_dict('list')
        return condition_set
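    # Based on the column handling above, the expected user spreadsheet looks roughly like
    # this: headers matching "factor"/"unit" (case-insensitive) become factor fields after
    # lower-casing and replacing " ontology " with "_ont_"; all remaining columns are treated
    # as conditions. Values and IDs below are illustrative placeholders:
    #
    #   Factor       Factor Ontology ID   Unit     Unit Ontology ID   condition_1   condition_2
    #   temperature  <factor ont id>      Kelvin   <unit ont id>      298           310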

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: Term text to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id)
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }
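    # Illustrative call for _search_ontologies (the term and returned values are placeholders;
    # the method returns None when the search yields no matching objects):
    #
    #   hit = self._search_ontologies('<ONT:0000000>')
    #   # hit -> {'ontology_ref': '<workspace part of the guid>', 'id': '<ONT:0000000>'}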

    def _add_ontology_info(self, factor):
        """Searches KBASE ontologies for terms matching the user supplied factors and units.
        Add the references if found"""
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        factor = {
            k: v
            for k, v in factor.items() if k not in optionals or v != ""
        }
        ont_info = self._search_ontologies(
            factor.get('factor_ont_id', "").replace("_", ":"))
        if ont_info:
            factor['factor_ont_ref'] = ont_info['ontology_ref']
            factor['factor_ont_id'] = ont_info['id']
        else:
            factor['factor_ont_ref'] = self.DEFAULT_ONTOLOGY_REF
            factor['factor_ont_id'] = self.DEFAULT_ONTOLOGY_ID

        if factor.get('unit'):
            ont_info = self._search_ontologies(
                factor.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                factor['unit_ont_ref'] = ont_info['ontology_ref']
                factor['unit_ont_id'] = ont_info['id']
            else:
                factor['unit_ont_ref'] = self.DEFAULT_ONTOLOGY_REF
                factor['unit_ont_id'] = self.DEFAULT_UNIT_ID
        return factor

    def to_tsv(self, params):
        """Convert an compound set to TSV file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)

        return _id, files

    def to_excel(self, params):
        """Convert an compound set to Excel file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        writer = pd.ExcelWriter(files['file_path'])

        if "KBaseExperiments.ConditionSet" in obj_type:
            df.to_excel(writer, "Conditions", index=False)
        elif "KBaseExperiments.ClusterSet" in obj_type:
            df.to_excel(writer, "ClusterSet", index=True)
        # else is checked in `_ws_obj_to_df`

        writer.close()  # close() also saves the workbook; ExcelWriter.save() was removed in pandas 2.0

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
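# Hedged end-to-end sketch for the utility above (the class name `ConditionUtils` and all
# parameter values are hypothetical placeholders, not confirmed by this listing):
#
#   util = ConditionUtils(config)
#   ref = util.file_to_condition_set({'input_file_path': 'conditions.tsv',
#                                     'output_ws_id': 12345,
#                                     'output_obj_name': 'cs1'})['condition_set_ref']
#   _id, files = util.to_tsv({'input_ref': ref, 'destination_dir': '/kb/module/work/tmp'})
#   shock = util.export(files['file_path'], _id, ref)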
Example #21
class EmmaxUtil:

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['token']
        self.shock_url = config['shock-url']
        self.scratch = os.path.join(config['scratch'], 'emmax_assoc_'+str(uuid.uuid4()))
        os.mkdir(self.scratch)
        self.config = config
        self.dfu = DataFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url, token=self.config['token'])
    
    def _validate_phenotype_file(self, phenotype_filepath, tfam_filepath, case_control=False):
        # TODO: check if case/control; if so, recode to 2/1 where necessary
        # TODO: verify the number of samples in the pheno file matches the tfam count
        # TODO: verify the order of the two files matches; if not, reorder the pheno file
        # TODO: verify the FID and within-family ID (IID) match the tfam
        pass


    def _validate_emmax_params(self, params):
        #TODO: All manner of param validation 
        pass

    def _create_tsv_file(self, top_snp_filepath, tsv_filename):
        log("Generating tsv file from {}".format(top_snp_filepath))
        cols = ['SNP', 'CHR', 'BP', 'P']
        snpData = pd.read_csv(top_snp_filepath, delimiter='\t', header=None, names=['ID', 'SE', 'p'])

        tsvData = pd.DataFrame(columns=cols)

        tsvData['SNP'] = snpData.ID
        tsvData['CHR'] = snpData.ID.str[1:2]
        tsvData['P'] = snpData.p

        tsvData.sort_values(by='CHR', inplace=True)

        bps = []
        current_chr = tsvData.iloc[0]['CHR']
        count = 1
        for idx, row in tsvData.iterrows():
            if current_chr != row['CHR']:
                current_chr = row['CHR']
                count = 1
            bps.append(count)
            count = count + 1
        tsvData['BP'] = bps
        tsv_filepath = os.path.join(self.scratch, tsv_filename)
        tsvData.to_csv(tsv_filepath, sep='\t', index=False)        
        return tsv_filepath
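    # Sketch of the transformation above (row values are illustrative): the top-SNP file is
    # read as tab-separated columns named ID / SE / p, the chromosome is taken from the second
    # character of the SNP ID, and BP is a per-chromosome running counter.
    #
    #   input row:   S1_12345    0.021   3.2e-07
    #   output row:  SNP=S1_12345  CHR=1  BP=<counter>  P=3.2e-07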
        
    def _copyDirectory(self, src, dest):
        try:
            shutil.copytree(src, dest)
        except shutil.Error as e:
            # Directories are the same
            print('Directory not copied. Error: %s' % e)
        except OSError as e:
            # Any error saying that the directory doesn't exist
            print('Directory not copied. Error: %s' % e)

    def _run_subprocess(self, command, print_output=False, use_shell=False):
        log("Executing command:\n{}\n".format(command))
        p = subprocess.Popen(command,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,  # stderr is merged into stdout
                             shell=use_shell)
        # communicate() returns (stdout, None) here because stderr is redirected to stdout
        output, _ = p.communicate()
        if print_output:
            print("Command output:\n{}".format(output))
        if p.returncode != 0:
            error_msg = "Error running command:\n{}\nReturn code: {}".format(command, p.returncode)
            raise ValueError(error_msg)

    def _download_variation_file(self, variation_obj_ref):
        log("Retrieving Variation Object {}...".format(variation_obj_ref))
        
        try:
            variation_shock_id = self.dfu.get_objects({
                'object_refs': [variation_obj_ref]
            })['data'][0]['data']['variation_file_reference']

            self.dfu.shock_to_file({
                'shock_id': variation_shock_id,
                'file_path': self.scratch,
                'unpack' : 'unpack'
            })
        except Exception as e:
            log("Error while retrieving Variation Object {}".format(variation_obj_ref))
            log(e)
            raise ValueError(e)

        variation_filename = [f for f in os.listdir(self.scratch) if f.endswith('.vcf')][0]
        variation_filepath = os.path.join(self.scratch, variation_filename)
        log("Variation file successfully downloaded to {}".format(variation_filepath))
        return variation_filepath 

    def _move_phenotype_data(self, pheno_filename):
        """ This is here until we get a Phenotype/Trait object working """
        pheno_filepath = os.path.join(self.scratch, pheno_filename)
        shutil.copy('/kb/module/data/' + pheno_filename, pheno_filepath)
        return pheno_filepath

    def _convert_vcf_to_plink(self, variation_filepath, fam_id = '--double-id', plink_file_prefix='plink_out'):
        #FIXME: This function should probably be in VCF utils
        log("Generating PLINK .tfam and .tped from VCF...")
        plink_cmd = ['plink']
        plink_cmd.append('--vcf')
        plink_cmd.append(variation_filepath)
        plink_cmd.append('--recode12')
        plink_cmd.append('transpose')
        plink_cmd.append('--output-missing-genotype')
        plink_cmd.append('0')
        plink_cmd.append(fam_id)
        plink_cmd.append('--out')
        plink_cmd.append(plink_file_prefix)

        self._run_subprocess(plink_cmd, print_output=True)
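    # The call above is equivalent to running (with the default arguments shown here):
    #
    #   plink --vcf <variation.vcf> --recode12 transpose --output-missing-genotype 0 \
    #         --double-id --out plink_out
    #
    # which writes plink_out.tped / plink_out.tfam into the scratch directory.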

    def _generate_kinship_matrix(self, tped_prefix, matrix_type='BN'):
        #TODO: check for existence of files before attempting matrix generation

        log("Generating {} Kinship Matrix...".format(matrix_type))
        kinship_cmd = ['emmax-kin', '-v', '-d', '10']
        if (matrix_type == 'BN'):
            # kinship_cmd.append('-d')
            # kinship_cmd.append('10')
            pass
        elif (matrix_type == 'IBS'):
            kinship_cmd.append('-s')
            # kinship_cmd.append('-d')
            # kinship_cmd.append('10')
        else:
            log("Invalid matrix type specified.  Aborting")
            raise ValueError("Invalid matrix type specified")
        kinship_cmd.append(tped_prefix)
        self._run_subprocess(kinship_cmd, print_output=True)

        kinship_matrix_filename = [f for f in os.listdir(self.scratch) if f.endswith('.kinf')][0]
        kinship_matrix_filepath = os.path.join(self.scratch, kinship_matrix_filename)
        log("Variation file successfully downloaded to {}".format(kinship_matrix_filepath))
        return kinship_matrix_filepath
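    # Equivalent command lines built above (tped_prefix shown as plink_out for illustration):
    #
    #   BN kinship:   emmax-kin -v -d 10 plink_out
    #   IBS kinship:  emmax-kin -v -d 10 -s plink_out
    #
    # The resulting *.kinf file found in the scratch directory is returned.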

    def _emmax_association(self, plink_prefix, pheno_filepath, kinship_filepath, emmax_params):
        log("Running EMMAX association analysis...")
        emmax_cmd = ['emmax']
        for param in emmax_params:
            emmax_cmd.append(param)
        emmax_cmd.append('-t')
        emmax_cmd.append(plink_prefix)
        emmax_cmd.append('-p')
        emmax_cmd.append(pheno_filepath)
        emmax_cmd.append('-k')
        emmax_cmd.append(kinship_filepath)
        emmax_cmd.append('-o')
        emmax_cmd.append(plink_prefix)

        self._run_subprocess(emmax_cmd)
        emmax_filenames = [f for f in os.listdir(self.scratch) if f.endswith('.reml') or f.endswith('.ps')]
        return emmax_filenames
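    # With the defaults used in run_emmax_association this expands to roughly:
    #
    #   emmax -v -d 10 -t <plink_prefix> -p <pheno_file> -k <kinship_file> -o <plink_prefix>
    #
    # and the .reml / .ps files it writes into scratch are returned by name.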
        
    def _select_top_snps(self, count, ps_filepath, output_filepath):
        #select_cmd = ['awk', '{print $NF, $0}', ps_filepath, "|", "sort", "-n", "|", "cut", "-f2-", "-d' '"]
        select_cmd = ["awk '{{print $NF,$0}}' {} | sort -n | cut -f2- -d' ' | sed -n -e '1,{}p' > {}".format(ps_filepath, str(count), output_filepath)]
        self._run_subprocess(select_cmd, print_output=False, use_shell=True)
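        # For count=100 the command expands to (file paths shown as placeholders):
        #
        #   awk '{print $NF,$0}' <results.ps> | sort -n | cut -f2- -d' ' \
        #       | sed -n -e '1,100p' > <top_snps.ps>
        #
        # i.e. prepend the p-value column, sort on it, strip it again and keep the top rows.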

    def _generate_output_files(self):
        log('Zipping EMMAX .reml and .ps files...')
        output_files = list()
        allowed_extensions = ['.ps', '.reml', '.tsv']
        result_file = os.path.join(self.scratch, 'emmax_results.zip')
        with zipfile.ZipFile(result_file, 'w',
                                zipfile.ZIP_DEFLATED,
                                allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(self.scratch):
                for file in files:
                    if (file.endswith(tuple(allowed_extensions))):
                        if file in zip_file.namelist():
                            continue
                        zip_file.write(os.path.join(root, file), file)

        output_files.append({'path': result_file,
                             'name': os.path.basename(result_file),
                             'label': os.path.basename(result_file),
                             'description': 'File(s) generated by EMMAX'})
        log("Importer output generated: {}".format(output_files))
        return output_files

    def _generate_html_report(self, template_dir, tsv_filepath):
        log("Generating HTML report...")
        html_report = []

        output_dir = os.path.join(self.scratch, 'html')

        self._copyDirectory(template_dir, output_dir)

        result_file_path = os.path.join(output_dir, 'index.html')

        shutil.copyfile(tsv_filepath, os.path.join(output_dir, 'emmax_top.tsv'))
        report_shock_id = self.dfu.file_to_shock({'file_path': output_dir,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'Manhattan plot'})
        return html_report

    def run_emmax_association(self, params):
        variation_filepath = self._download_variation_file(params['variation_obj_ref'])
        self._convert_vcf_to_plink(variation_filepath, '--double-id', params['output_file_prefix'])
        
        pheno_filepath = self._move_phenotype_data('flcReordered.pheno')
        kinship_filepath = self._generate_kinship_matrix(params['output_file_prefix'])
        
        emmax_params = ['-v', '-d', '10']
        emmax_assoc_files = self._emmax_association(params['output_file_prefix'], pheno_filepath, kinship_filepath, emmax_params)
        
        full_result_filepath = os.path.join(self.scratch, params['output_file_prefix'] + '.ps')
        top_snp_filepath = os.path.join(self.scratch, TOP_SNP_FN)
        self._select_top_snps(params['snp_return_count'], full_result_filepath, top_snp_filepath)
        tsv_filepath = self._create_tsv_file(top_snp_filepath, 'emmax_top.tsv')
        output_html_files = self._generate_html_report(TEMPLATE_DIRECTORY, tsv_filepath)
        output_emmax_files = self._generate_output_files()
        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'file_links': output_emmax_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name': 'emmax_assoc_html_report_' + str(uuid.uuid4())
        }
        kbr_output = self.kbr.create_extended_report(report_params)
        report_output = {
            'report_name': kbr_output['name'],
            'report_ref': kbr_output['ref'],
        }
        log("EMMAX report generated successfully!")
        return report_output
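# Hedged usage sketch for EmmaxUtil (config keys follow __init__ above; the workspace values
# and the variation object reference are placeholders, not real data):
#
#   util = EmmaxUtil({'workspace-url': ws_url, 'SDK_CALLBACK_URL': cb_url,
#                     'token': token, 'shock-url': shock_url, 'scratch': '/kb/module/work/tmp'})
#   report = util.run_emmax_association({'variation_obj_ref': '1/2/3',
#                                        'output_file_prefix': 'plink_out',
#                                        'snp_return_count': 100,
#                                        'workspace_name': 'my_workspace'})
#   # report -> {'report_name': ..., 'report_ref': ...}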