Example #1
    def download_assembly(self, token, assembly_ref):
        try:
            auClient = AUClient(self.callback_url, token=token,
                                service_ver=self.SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate auClient with callback_url: '
                             + self.callback_url + ' ERROR: ' + str(e))
        try:
            dfuClient = DFUClient(self.callback_url, token=token,
                                  service_ver=self.SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate dfuClient with callback_url: '
                             + self.callback_url + ' ERROR: ' + str(e))

        contig_file = auClient.get_assembly_as_fasta({'ref':assembly_ref}).get('path')
        sys.stdout.flush()   # don't remember why this matters
        contig_file_path = dfuClient.unpack_file({'file_path': contig_file})['file_path']
        return contig_file_path
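A minimal usage sketch (not from the original module), assuming this method belongs to a KBase SDK implementation class that defines callback_url and SERVICE_VER, and that AUClient/DFUClient are aliases for the AssemblyUtil and DataFileUtil clients; the token and reference below are placeholders:

# hypothetical caller, e.g. inside another method of the same SDK class
token = ctx['token']                # placeholder: auth token from the call context
fasta_path = self.download_assembly(token, '12345/6/7')  # placeholder assembly ref
print('assembly FASTA unpacked to', fasta_path)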
Example #2
def upload_interleaved_reads(callback_url, reads_file, ws_name, reads_obj_name, source_reads_upa):
    """
    callback_url = as usual.
    reads_file = full path to the reads file to upload
    ws_name = the workspace to use for uploading the reads file
    reads_obj_name = the name of the new reads object to save as
    source_reads = if not None, the source UPA for the original reads file.
    """
    # unfortunately, the ReadsUtils only accepts uncompressed fq files- this should
    # be fixed on the KBase side
    dfu = DataFileUtil(callback_url)
    reads_unpacked = dfu.unpack_file({'file_path': reads_file})['file_path']

    ru = ReadsUtils(callback_url)
    new_reads_upa = ru.upload_reads({
        'fwd_file': reads_unpacked,
        'interleaved': 1,
        'wsname': ws_name,
        'name': reads_obj_name,
        'source_reads_ref': source_reads_upa
    })['obj_ref']
    print('saved ' + str(reads_unpacked) + ' to ' + str(new_reads_upa))
    return new_reads_upa
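A hedged usage sketch (not part of the original code), assuming the callback URL comes from the usual SDK environment variable; the file path, workspace name, object name, and source UPA are placeholders:

import os

callback_url = os.environ['SDK_CALLBACK_URL']
new_upa = upload_interleaved_reads(
    callback_url,
    '/kb/module/work/tmp/trimmed.inter.fastq.gz',  # placeholder reads file
    'my_workspace',                                # placeholder workspace name
    'trimmed_reads',                               # name for the new Reads object
    '11111/2/3')                                   # placeholder source reads UPA
print('new reads object:', new_upa)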
Example #3
class CompoundSetUtils:
    '''
    Module Name:
    CompoundSetUtils
    Module Description:
    A KBase module: CompoundSetUtils
    Contains tools for import & export of compound sets
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "2.1.2"
    GIT_URL = "https://github.com/Tianhao-Gu/CompoundSetUtils.git"
    GIT_COMMIT_HASH = "12e1f23022354f475d7ceb3631913956eb5831a7"

    #BEGIN_CLASS_HEADER
    @staticmethod
    def _check_param(in_params, req_param, opt_param=list()):
        """
        Check if each of the params in the list are in the input params
        """
        for param in req_param:
            if param not in in_params:
                raise ValueError('{} parameter is required'.format(param))
        defined_param = set(req_param + opt_param)
        for param in in_params:
            if param not in defined_param:
                logging.warning(
                    "Received unexpected parameter {}".format(param))

    def _save_to_ws_and_report(self, ws_id, source, compoundset, message=None):
        """Save compound set to the workspace and make report"""
        info = self.dfu.save_objects({
            'id': ws_id,
            "objects": [{
                "type": "KBaseBiochem.CompoundSet",
                "data": compoundset,
                "name": compoundset['name']
            }]
        })[0]
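        # Workspace object_info tuple: [0] object id, [1] name, [4] version,
        # [6] workspace id, [7] workspace name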
        compoundset_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        if not message:
            message = 'Imported %s as %s' % (source, info[1])
        report_params = {
            'objects_created': [{
                'ref': compoundset_ref,
                'description': 'Compound Set'
            }],
            'message': message,
            'workspace_name': info[7],
            'report_object_name': 'compound_set_creation_report'
        }

        # Construct the output to send back
        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report(report_params)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'compoundset_ref': compoundset_ref
        }
        return output

    def _export_compound_set(self, ref, file_type):
        logging.info("Exporting {} as {}".format(ref, file_type))
        compoundset = self.dfu.get_objects({'object_refs':
                                            [ref]})['data'][0]['data']
        temp_dir = "{}/{}".format(self.scratch, uuid.uuid4())
        os.mkdir(temp_dir)
        out_dir = "{}/{}".format(temp_dir, compoundset['name'])
        os.mkdir(out_dir)
        target = "{}/{}.{}".format(out_dir, compoundset['name'], file_type)
        if file_type == 'tsv':
            parse.write_tsv(compoundset, target)
        elif file_type == 'sdf':
            parse.write_sdf(compoundset, target)
        else:
            raise ValueError("Bad file_type: {}".format(file_type))
        handle = self.dfu.package_for_download({
            'file_path': out_dir,
            'ws_refs': [ref]
        })
        output = {'shock_id': handle['shock_id']}
        return output

    def _fetch_mol2_files(self, ref):
        compoundset_obj = self.dfu.get_objects({'object_refs':
                                                [ref]})['data'][0]
        compoundset_info = compoundset_obj['info']
        compoundset = compoundset_obj['data']
        temp_dir = "{}/{}".format(self.scratch, uuid.uuid4())
        os.mkdir(temp_dir)

        compounds = compoundset.get('compounds')

        mol2_files = []
        comp_id_mol2_file_name_map = {}
        for compound in compounds:
            mol2_handle_ref = compound.get('mol2_handle_ref')

            if mol2_handle_ref:
                mol2_file_path = self.dfu.shock_to_file({
                    'handle_id': mol2_handle_ref,
                    'file_path': temp_dir
                }).get('file_path')
                mol2_files.append(mol2_file_path)
                comp_id_mol2_file_name_map[compound['id']] = os.path.basename(
                    mol2_file_path)

        packed_mol2_files_path = None
        if mol2_files:
            packed_mol2_files_path = os.path.join(
                temp_dir, compoundset_info[1] + '_mol2_files.zip')
            with zipfile.ZipFile(packed_mol2_files_path, 'w') as zipMe:
                for file in mol2_files:
                    zipMe.write(file, compress_type=zipfile.ZIP_DEFLATED)

        return packed_mol2_files_path, comp_id_mol2_file_name_map

    def _convert_mol2_files_to_pdbqt(self, ref):
        compoundset_obj = self.dfu.get_objects({'object_refs':
                                                [ref]})['data'][0]
        compoundset_info = compoundset_obj['info']
        compoundset = compoundset_obj['data']
        mol2_temp_dir = "{}/{}".format(self.scratch, uuid.uuid4())
        os.mkdir(mol2_temp_dir)
        pdbqt_temp_dir = "{}/{}".format(self.scratch, uuid.uuid4())
        os.mkdir(pdbqt_temp_dir)

        compounds = compoundset.get('compounds')

        pdbqt_files = []
        comp_id_pdbqt_file_name_map = {}
        for compound in compounds:
            mol2_handle_ref = compound.get('mol2_handle_ref')

            if mol2_handle_ref:
                mol2_file_path = self.dfu.shock_to_file({
                    'handle_id': mol2_handle_ref,
                    'file_path': mol2_temp_dir
                }).get('file_path')
                pdbqt_file_path = os.path.join(pdbqt_temp_dir,
                                               compound['id'] + '.pdbqt')

                command = [
                    'obabel', '-i', 'mol2', mol2_file_path, '-o', 'pdbqt',
                    '-O', pdbqt_file_path
                ]
                process = Popen(command, stdout=PIPE, stderr=PIPE)
                stdout, stderr = process.communicate()
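                # obabel reports success on stderr (e.g. "1 molecule converted"),
                # so the check below inspects stderr rather than the return code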

                if 'converted' in str(stderr) and 'molecule' in str(stderr):
                    logging.info(
                        'Successfully converted Mol2 to pdbqt format: {}'.
                        format(os.path.basename(mol2_file_path)))
                    pdbqt_files.append(pdbqt_file_path)
                    comp_id_pdbqt_file_name_map[
                        compound['id']] = os.path.basename(pdbqt_file_path)
                else:
                    logging.warning(
                        'Cannot convert Mol2 file to pdbqt format: {}'.format(
                            os.path.basename(mol2_file_path)))
                    logging.warning(stderr)

        packed_pdbqt_files_path = None
        if pdbqt_files:
            packed_pdbqt_files_path = os.path.join(
                pdbqt_temp_dir, compoundset_info[1] + '_pdbqt_files.zip')
            with zipfile.ZipFile(packed_pdbqt_files_path, 'w') as zipMe:
                for file in pdbqt_files:
                    zipMe.write(file, compress_type=zipfile.ZIP_DEFLATED)

        return packed_pdbqt_files_path, comp_id_pdbqt_file_name_map

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def compound_set_from_file(self, ctx, params):
        """
        CompoundSetFromFile
        string staging_file_path
        :param params: instance of type "compoundset_upload_params" ->
           structure: parameter "workspace_id" of String, parameter
           "staging_file_path" of String, parameter "compound_set_name" of
           String, parameter "mol2_staging_file_path" of String
        :returns: instance of type "compoundset_upload_results" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "compoundset_ref" of type "obj_ref"
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_from_file
        self._check_param(
            params, ['workspace_id', 'staging_file_path', 'compound_set_name'],
            opt_param=['mol2_staging_file_path'])
        scratch_file_path = self.dfu.download_staging_file({
            'staging_file_subdir_path': params['staging_file_path']
        }).get('copy_file_path')
        # I probably should be uploading the raw files to shock

        mol2_staging_file_path = params.get('mol2_staging_file_path')

        mol2_file_dir = None
        if mol2_staging_file_path:
            mol2_scratch_file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path': mol2_staging_file_path
            }).get('copy_file_path')

            try:
                logging.info("start unpacking mol2 file")
                mol2_file_path_out = self.dfu.unpack_file(
                    {'file_path': mol2_scratch_file_path})['file_path']
                mol2_file_dir = os.path.dirname(mol2_file_path_out)
            except Exception:
                # report the downloaded (pre-unpack) path; the unpacked path is
                # undefined if unpack_file itself failed
                raise ValueError('Cannot unpack mol2 file: {}'.format(
                    os.path.basename(mol2_scratch_file_path)))

        ext = os.path.splitext(scratch_file_path)[1]
        file_name = os.path.basename(scratch_file_path)
        if ext == '.sdf':
            compounds = parse.read_sdf(scratch_file_path,
                                       mol2_file_dir=mol2_file_dir,
                                       callback_url=self.callback_url)
        elif ext == '.tsv':
            compounds = parse.read_tsv(scratch_file_path,
                                       mol2_file_dir=mol2_file_dir,
                                       callback_url=self.callback_url)
        else:
            raise ValueError('Invalid input file type. Expects .tsv or .sdf')

        compoundset = {
            'id': params['compound_set_name'],
            'name': params['compound_set_name'],
            'description': 'Compound Set produced from %s' % file_name,
            'compounds': compounds,
        }

        output = self._save_to_ws_and_report(params['workspace_id'],
                                             params['staging_file_path'],
                                             compoundset)
        #END compound_set_from_file

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_from_file return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def compound_set_to_file(self, ctx, params):
        """
        CompoundSetToFile
        string compound_set_name
        string output_format
        :param params: instance of type "compoundset_download_params" ->
           structure: parameter "compound_set_ref" of String, parameter
           "output_format" of String
        :returns: instance of type "compoundset_download_results" ->
           structure: parameter "file_path" of String, parameter
           "packed_mol2_files_path" of String, parameter
           "comp_id_mol2_file_name_map" of mapping from String to String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_to_file
        self._check_param(params, ['compound_set_ref', 'output_format'])
        ret = self.dfu.get_objects(
            {'object_refs': [params['compound_set_ref']]})['data'][0]
        compoundset = ret['data']
        ext = params['output_format']
        out = f"{self.scratch}/{uuid.uuid4()}"
        os.mkdir(out)
        out += f"/{compoundset['name']}"
        if ext == 'sdf':
            outfile_path = parse.write_sdf(compoundset, out)
        elif ext == 'tsv':
            outfile_path = parse.write_tsv(compoundset, out)
        else:
            outfile_path = parse.write_mol_dir(compoundset, out, ext)

        packed_mol2_files_path, comp_id_mol2_file_name_map = self._fetch_mol2_files(
            params['compound_set_ref'])

        output = {
            'file_path': outfile_path,
            'packed_mol2_files_path': packed_mol2_files_path,
            'comp_id_mol2_file_name_map': comp_id_mol2_file_name_map
        }

        #END compound_set_to_file

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_to_file return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def compound_set_from_model(self, ctx, params):
        """
        CompoundSetFromModel
        required:
        string workspace_id
        string model_ref
        string compound_set_name
        :param params: instance of type "compoundset_from_model_params" ->
           structure: parameter "workspace_id" of String, parameter
           "model_ref" of String, parameter "compound_set_name" of String
        :returns: instance of type "compoundset_upload_results" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "compoundset_ref" of type "obj_ref"
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_from_model
        self._check_param(params,
                          ['workspace_id', 'model_ref', 'compound_set_name'])
        model = self.dfu.get_objects({'object_refs': [params['model_ref']]
                                      })['data'][0]['data']
        compounds, undef = parse.parse_model(model)
        compoundset = {
            'id': params['compound_set_name'],
            'name': params['compound_set_name'],
            'description': 'Compound Set produced from %s, a metabolic model'
                           % model['id'],
            'compounds': compounds,
        }

        output = self._save_to_ws_and_report(params['workspace_id'],
                                             model['name'], compoundset)
        #END compound_set_from_model

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_from_model return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_compoundset_as_tsv(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_compoundset_as_tsv
        output = self._export_compound_set(params['input_ref'], 'tsv')
        #END export_compoundset_as_tsv

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_compoundset_as_tsv return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_compoundset_as_sdf(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_compoundset_as_sdf
        output = self._export_compound_set(params['input_ref'], 'sdf')
        #END export_compoundset_as_sdf

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_compoundset_as_sdf return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_compoundset_mol2_files(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "export_mol2_files_results" -> structure:
           parameter "packed_mol2_files_path" of String, parameter
           "comp_id_mol2_file_name_map" of mapping from String to String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_compoundset_mol2_files
        self._check_param(params, ['input_ref'])

        packed_mol2_files_path, comp_id_mol2_file_name_map = self._fetch_mol2_files(
            params['input_ref'])

        output = {
            'packed_mol2_files_path': packed_mol2_files_path,
            'comp_id_mol2_file_name_map': comp_id_mol2_file_name_map
        }
        #END export_compoundset_mol2_files

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method export_compoundset_mol2_files return value ' +
                'output is not type dict as required.')
        # return the results
        return [output]

    def convert_compoundset_mol2_files_to_pdbqt(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "convert_mol2_files_results" -> structure:
           parameter "packed_pdbqt_files_path" of String, parameter
           "comp_id_pdbqt_file_name_map" of mapping from String to String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN convert_compoundset_mol2_files_to_pdbqt
        self._check_param(params, ['input_ref'])

        packed_pdbqt_files_path, comp_id_pdbqt_file_name_map = self._convert_mol2_files_to_pdbqt(
            params['input_ref'])

        output = {
            'packed_pdbqt_files_path': packed_pdbqt_files_path,
            'comp_id_pdbqt_file_name_map': comp_id_pdbqt_file_name_map
        }
        #END convert_compoundset_mol2_files_to_pdbqt

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method convert_compoundset_mol2_files_to_pdbqt return value '
                + 'output is not type dict as required.')
        # return the results
        return [output]

    def fetch_mol2_files_from_zinc(self, ctx, params):
        """
        :param params: instance of type "FetchZINCMol2Params" -> structure:
           parameter "workspace_id" of String, parameter "compoundset_ref" of
           type "obj_ref", parameter "over_write" of Long
        :returns: instance of type "compoundset_upload_results" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "compoundset_ref" of type "obj_ref"
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN fetch_mol2_files_from_zinc
        self._check_param(params, ['workspace_id', 'compoundset_ref'],
                          opt_param=['over_write'])
        over_write = params.get('over_write', False)
        compoundset = self.dfu.get_objects(
            {'object_refs': [params['compoundset_ref']]})['data'][0]['data']

        compoundset_copy = copy.deepcopy(compoundset)

        count = 0
        for compound in compoundset_copy.get('compounds'):
            if not compound.get('mol2_handle_ref') or over_write:
                temp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
                os.mkdir(temp_dir)
                mol2_file_path = os.path.join(temp_dir, compound.get('id'))
                inchikey = compound.get('inchikey')
                if zinc_db_util.inchikey_to_mol2(inchikey, mol2_file_path):
                    handle_id = self.dfu.file_to_shock({
                        'file_path': mol2_file_path,
                        'make_handle': True
                    })['handle']['hid']
                    compound['mol2_handle_ref'] = handle_id
                    compound['mol2_source'] = 'ZINC15'
                    count += 1
                else:
                    logging.warning(
                        'Cannot find Mol2 file from ZINC for {}'.format(
                            inchikey))

        if count:
            message = 'Successfully fetched {} Mol2 files from ZINC database'.format(
                count)
        else:
            message = 'Fetched 0 Mol2 files from ZINC database. The CompoundSet object remains unchanged.'

        output = self._save_to_ws_and_report(params['workspace_id'],
                                             '',
                                             compoundset_copy,
                                             message=message)

        #END fetch_mol2_files_from_zinc

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method fetch_mol2_files_from_zinc return value ' +
                'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
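A minimal driver sketch (not part of the module) showing how CompoundSetUtils might be exercised inside a KBase SDK job, assuming SDK_CALLBACK_URL points at a running callback server and the config supplies a writable scratch directory; the workspace id, file name, and empty ctx are placeholders:

import os

os.environ.setdefault('SDK_CALLBACK_URL', 'http://localhost:9999')  # placeholder
config = {'scratch': '/kb/module/work/tmp'}  # placeholder scratch directory

csu = CompoundSetUtils(config)
result = csu.compound_set_from_file(ctx={}, params={
    'workspace_id': 12345,                  # placeholder workspace id
    'staging_file_path': 'compounds.tsv',   # file in the user's staging area
    'compound_set_name': 'my_compound_set',
})[0]
print(result['compoundset_ref'], result['report_name'])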
Example #4
class GenbankToGenome:
    def __init__(self, config):
        self.cfg = config
        self.gi = GenomeInterface(config)
        self.dfu = DataFileUtil(config.callbackURL)
        self.aUtil = AssemblyUtil(config.callbackURL)
        self.ws = Workspace(config.workspaceURL)
        self._messages = []
        self.time_string = datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y_%m_%d_%H_%M_%S')
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        self.version = re.search(r"module-version:\n\W+(.+)\n",
                                 yml_text).group(1)
        self.generate_parents = False
        self.generate_ids = False
        self.genes = OrderedDict()
        self.mrnas = OrderedDict()
        self.cdss = OrderedDict()
        self.noncoding = []
        self.ontologies_present = defaultdict(dict)
        self.ontology_events = list()
        self.skiped_features = Counter()
        self.feature_counts = Counter()
        self.orphan_types = Counter()
        self.contig_seq = {}
        self.circ_contigs = set()
        self.features_spaning_zero = set()
        self.genome_warnings = []
        self.genome_suspect = False
        self.defects = Counter()
        self.spoofed_genes = 0
        self.excluded_features = ('source', 'exon', 'fasta_record')
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        self.code_table = 11
        self.re_api_url = config.re_api_url
        # dict with feature 'id's that have been used more than once.
        self.used_twice_identifiers = {}
        self.default_params = {
            'source': 'Genbank',
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'taxon_lookup_obj_name': self.cfg.raw['taxon-lookup-object-name'],
            'ontology_wsname': self.cfg.raw['ontology-workspace-name'],
            'ontology_GO_obj_name': self.cfg.raw['ontology-gene-ontology-obj-name'],
            'ontology_PO_obj_name': self.cfg.raw['ontology-plant-ontology-obj-name'],
            'release': None,
            'genetic_code': 11,
            'generate_ids_if_needed': 0,
            'metadata': {}
        }

    @property
    def messages(self):
        return "\n".join(self._messages)

    def refactored_import(self, ctx, params):
        # 1) validate parameters and extract defaults
        self.validate_params(params)

        # 2) construct the input directory staging area
        input_directory = self.stage_input(params)

        # 3) update default params
        self.default_params.update(params)
        params = self.default_params
        self.generate_parents = params.get('generate_missing_genes')
        self.generate_ids = params.get('generate_ids_if_needed')
        if params.get('genetic_code'):
            self.code_table = params['genetic_code']

        # 4) Do the upload
        files = self._find_input_files(input_directory)
        consolidated_file = self._join_files_skip_empty_lines(files)
        genome = self.parse_genbank(consolidated_file, params)
        if params.get('genetic_code'):
            genome["genetic_code"] = params['genetic_code']

        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params['metadata'],
        })
        ref = f"{result['info'][6]}/{result['info'][0]}/{result['info'][4]}"
        logging.info(f"Genome saved to {ref}")

        # 5) clear the temp directory
        shutil.rmtree(input_directory)

        # 6) return the result
        info = result['info']
        details = {'genome_ref': ref, 'genome_info': info}

        return details

    @staticmethod
    def validate_params(params):
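        # Illustrative params that would pass validation (all values are
        # placeholders):
        #   {'workspace_name': 'my_workspace', 'genome_name': 'my_genome',
        #    'file': {'path': '/path/to/genome.gbff'}, 'genetic_code': 11}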
        if 'workspace_name' not in params:
            raise ValueError('required "workspace_name" field was not defined')
        if 'genome_name' not in params:
            raise ValueError('required "genome_name" field was not defined')
        if 'file' not in params:
            raise ValueError('required "file" field was not defined')

        # one and only one of 'path', 'shock_id', or 'ftp_url' is required
        file = params['file']
        if not isinstance(file, dict):
            raise ValueError('required "file" field must be a map/dict')
        sources = ('path', 'shock_id', 'ftp_url')
        n_valid_fields = sum(1 for f in sources if file.get(f))
        if n_valid_fields < 1:
            raise ValueError(f'required "file" field must include one source: '
                             f'{", ".join(sources)}')
        if n_valid_fields > 1:
            raise ValueError(
                f'required "file" field has too many sources specified: '
                f'{", ".join(file.keys())}')
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(f"Invalid genetic code specified: {params}")

    def stage_input(self, params):
        """ Setup the input_directory by fetching the files and uncompressing if needed. """

        # construct the input directory where we stage files
        input_directory = os.path.join(
            self.cfg.sharedFolder, f'genome-upload-staging-{uuid.uuid4()}')
        os.makedirs(input_directory)

        # at this point, the 'file' input is validated, so we don't have to catch any special cases
        # we expect one and only one of path, shock_id, or ftp_url

        # determine how to get the file: if it is from shock, download it.  If it
        # is just sitting there, then use it.  Move the file to the staging input directory
        file = params['file']
        genbank_file_path = None
        if file.get('path') is not None:
            # copy the local file to the input staging directory
            # (NOTE: could just move it, but then this method would have the side effect of moving your
            # file which another SDK module might have an open handle on)
            local_file_path = file['path']
            genbank_file_path = os.path.join(input_directory,
                                             os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        if 'shock_id' in file and file['shock_id'] is not None:
            # handle shock file
            logging.info(
                f'Downloading file from SHOCK node: {self.cfg.shockURL} - {file["shock_id"]}'
            )
            sys.stdout.flush()
            file_name = self.dfu.shock_to_file({
                'file_path': input_directory,
                'shock_id': file['shock_id']
            })['node_file_name']
            genbank_file_path = os.path.join(input_directory, file_name)

        if 'ftp_url' in file and file['ftp_url'] is not None:
            logging.info('Downloading file from: ' + str(file['ftp_url']))
            local_file_path = self.dfu.download_web_file({
                'file_url': file['ftp_url'],
                'download_type': 'FTP'
            })['copy_file_path']
            genbank_file_path = os.path.join(input_directory,
                                             os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        # extract the file if it is compressed
        if genbank_file_path is not None:
            logging.info("staged input file = " + genbank_file_path)
            self.dfu.unpack_file({'file_path': genbank_file_path})

        else:
            raise ValueError(
                'No valid files could be extracted based on the input')

        return input_directory

    def parse_genbank(self, file_path, params):
        logging.info("Saving original file to shock")
        shock_res = self.dfu.file_to_shock({
            'file_path': file_path,
            'make_handle': 1,
            'pack': 'gzip',
        })
        # Write and save assembly file
        assembly_ref = self._save_assembly(file_path, params)
        assembly_data = self.dfu.get_objects({
            'object_refs': [assembly_ref],
            'ignore_errors': 0
        })['data'][0]['data']
        genome = {
            "id": params['genome_name'],
            "original_source_file_name": os.path.basename(file_path),
            "assembly_ref": assembly_ref,
            "gc_content": assembly_data['gc_content'],
            "dna_size": assembly_data['dna_size'],
            "md5": assembly_data['md5'],
            "genbank_handle_ref": shock_res['handle']['hid'],
            "publications": set(),
            "contig_ids": [],
            "contig_lengths": [],
        }
        genome['source'], genome['genome_tiers'] = self.gi.determine_tier(
            params['source'])

        if params.get('genome_type'):
            genome['genome_type'] = params['genome_type']

        # Set taxonomy-related fields in the genome
        # Also validates the given taxon ID
        if params.get('taxon_id'):
            set_taxon_data(int(params['taxon_id']), self.re_api_url, genome)
        else:
            set_default_taxon_data(genome)

        dates = []
        # Parse data from genbank file
        contigs = Bio.SeqIO.parse(file_path, "genbank")
        for record in contigs:
            r_annot = record.annotations
            logging.info("parsing contig: " + record.id)
            try:
                dates.append(time.strptime(r_annot.get('date'), "%d-%b-%Y"))
            except (TypeError, ValueError):
                pass
            genome['contig_ids'].append(record.id)
            genome['contig_lengths'].append(len(record))
            genome["publications"] |= self._get_pubs(r_annot)

            # only do the following once (on the first contig)
            if "source_id" not in genome:
                genome["source_id"] = record.id.split('.')[0]
                organism = r_annot.get('organism', 'Unknown Organism')
                if params.get('scientific_name'):
                    genome['scientific_name'] = params['scientific_name']
                else:
                    genome['scientific_name'] = organism
                self.code_table = genome['genetic_code']
                genome["molecule_type"] = r_annot.get('molecule_type', 'DNA')
                genome['notes'] = r_annot.get('comment',
                                              "").replace('\\n', '\n')

            self._parse_features(record, genome['source'])

        genome.update(self.get_feature_lists())

        genome['num_contigs'] = len(genome['contig_ids'])
        # add dates
        dates.sort()
        if dates:
            genome['external_source_origination_date'] = time.strftime(
                "%d-%b-%Y", dates[0])
            if dates[0] != dates[-1]:
                genome['external_source_origination_date'] += " _ " + \
                    time.strftime("%d-%b-%Y", dates[-1])

        if self.ontologies_present:
            genome['ontologies_present'] = dict(self.ontologies_present)
            genome["ontology_events"] = self.ontology_events
        genome['feature_counts'] = dict(self.feature_counts)
        # can't serialize a set
        genome['publications'] = list(genome['publications'])

        if len(genome['cdss']) and (self.defects['cds_seq_not_matching'] /
                                    float(len(genome['cdss'])) > 0.02):
            self.genome_warnings.append(
                warnings["genome_inc_translation"].format(
                    self.defects['cds_seq_not_matching'], len(genome['cdss'])))
            self.genome_suspect = 1

        if self.defects['bad_parent_loc']:
            self.genome_warnings.append(
                f"There were {self.defects['bad_parent_loc']} parent/child "
                "relationships that were not able to be determined. Some of "
                "these may have splice variants that may be valid relationships."
            )

        if self.defects['spoofed_genes']:
            self.genome_warnings.append(warnings['spoofed_genome'].format(
                self.defects['spoofed_genes']))
            genome['suspect'] = 1

        if self.defects['not_trans_spliced']:
            self.genome_warnings.append(
                warnings['genome_not_trans_spliced'].format(
                    self.defects['not_trans_spliced']))
            genome['suspect'] = 1

        if self.genome_warnings:
            genome['warnings'] = self.genome_warnings
        if self.genome_suspect:
            genome['suspect'] = 1
        logging.info(f"Feature Counts: {genome['feature_counts']}")
        return genome

    def _save_assembly(self, genbank_file, params):
        """Convert genbank file to fasta and sve as assembly"""
        contigs = Bio.SeqIO.parse(genbank_file, "genbank")
        assembly_id = f"{params['genome_name']}_assembly"
        fasta_file = f"{self.cfg.sharedFolder}/{params['genome_name']}_assembly.fasta"

        out_contigs = []
        extra_info = defaultdict(dict)
        for in_contig in contigs:
            if in_contig.annotations.get('topology', "") == 'circular':
                extra_info[in_contig.id]['is_circ'] = 1
                self.circ_contigs.add(in_contig.id)
            elif in_contig.annotations.get('topology', "") == 'linear':
                extra_info[in_contig.id]['is_circ'] = 0
            out_contigs.append(in_contig)
            self.contig_seq[in_contig.id] = in_contig.seq.upper()

        assembly_ref = params.get("use_existing_assembly")
        if assembly_ref:
            if not re.match(r"\d+/\d+/\d+", assembly_ref):
                raise ValueError(
                    f"Assembly ref: {assembly_ref} is not a valid format. Must"
                    f" be in numerical <ws>/<object>/<version> format.")
            ret = self.dfu.get_objects({'object_refs':
                                        [assembly_ref]})['data'][0]
            if "KBaseGenomeAnnotations.Assembly" not in ret['info'][2]:
                raise ValueError(
                    f"{assembly_ref} is not a reference to an assembly")
            unmatched_ids = list()
            unmatched_ids_md5s = list()
            for current_contig in self.contig_seq.keys():
                current_contig_md5 = hashlib.md5(
                    str(self.contig_seq[current_contig]).encode(
                        'utf8')).hexdigest()
                if current_contig in ret['data']['contigs']:
                    if current_contig_md5 != ret['data']['contigs'][
                            current_contig]['md5']:
                        unmatched_ids_md5s.append(current_contig)
                else:
                    unmatched_ids.append(current_contig)
            if len(unmatched_ids) > 0:
                raise ValueError(warnings['assembly_ref_extra_contigs'].format(
                    ", ".join(unmatched_ids)))
            if len(unmatched_ids_md5s) > 0:
                raise ValueError(warnings["assembly_ref_diff_seq"].format(
                    ", ".join(unmatched_ids_md5s)))
            logging.info(f"Using supplied assembly: {assembly_ref}")
            return assembly_ref
        logging.info("Saving sequence as Assembly object")
        Bio.SeqIO.write(out_contigs, fasta_file, "fasta")
        assembly_ref = self.aUtil.save_assembly_from_fasta({
            'file': {'path': fasta_file},
            'workspace_name': params['workspace_name'],
            'assembly_name': assembly_id,
            'type': params.get('genome_type', 'isolate'),
            'contig_info': extra_info
        })
        logging.info(f"Assembly saved to {assembly_ref}")
        return assembly_ref

    def _find_input_files(self, input_directory):
        logging.info("Scanning for Genbank Format files.")
        valid_extensions = [".gbff", ".gbk", ".gb", ".genbank", ".dat", ".gbf"]

        files = os.listdir(os.path.abspath(input_directory))
        logging.info("Genbank Files : " + ", ".join(files))
        genbank_files = [
            x for x in files
            if os.path.splitext(x)[-1].lower() in valid_extensions
        ]

        if len(genbank_files) == 0:
            raise Exception(
                f"The input directory does not have any files with one of the "
                f"following extensions {','.join(valid_extensions)}.")

        logging.info(f"Found {len(genbank_files)} genbank files")

        input_files = []
        for genbank_file in genbank_files:
            input_files.append(os.path.join(input_directory, genbank_file))

        return input_files

    def _join_files_skip_empty_lines(self, input_files):
        """ Applies strip to each line of each input file.
            Args:
                input_files: Paths to input files in Genbank format.
            Returns:
                Path to resulting file (currenly it's the same file as input).
            """
        if len(input_files) == 0:
            raise ValueError("NO GENBANK FILE")
        temp_dir = os.path.join(os.path.dirname(input_files[0]), "combined")
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        ret_file = os.path.join(temp_dir, os.path.basename(input_files[0]))

        # take in Genbank file and remove all empty lines from it.
        with open(ret_file, 'w', buffering=2**20) as f_out:
            for input_file in input_files:
                with open(input_file, 'r') as f_in:
                    for line in f_in:
                        line = line.rstrip('\r\n')
                        if line.strip():
                            f_out.write(line + '\n')
        return ret_file

    def _get_pubs(self, r_annotations):
        """Get a contig's publications"""
        pub_list = []
        for in_pub in r_annotations.get('references', []):
            # don't add blank pubs
            if not in_pub.authors:
                continue
            out_pub = [
                0,  # pmid
                "",  # source
                in_pub.title,
                "",  # web address
                "",  # date
                in_pub.authors,
                in_pub.journal,
            ]
            date_match = re.match(r"\((\d{4})\)", in_pub.journal)
            if date_match:
                out_pub[4] = date_match.group(1)
            if in_pub.pubmed_id:
                out_pub[0:4] = [
                    int(in_pub.pubmed_id), "PubMed", in_pub.title,
                    f"http://www.ncbi.nlm.nih.gov/pubmed/{in_pub.pubmed_id}"
                ]
            pub_list.append(tuple(out_pub))
        logging.info(f"Parsed {len(pub_list)} publication records")
        return set(pub_list)

    def _get_id(self, feat, tags=None):
        """Assign a id to a feature based on the first tag that exists"""
        _id = ""
        if not tags:
            tags = ['locus_tag', 'kbase_id']
        for t in tags:
            _id = feat.qualifiers.get(t, [""])[0]
            if _id:
                break

        if not _id:
            if feat.type == 'gene':
                if not self.generate_ids:
                    raise ValueError(
                        f"Unable to find a valid id for gene "
                        f"among these tags: {', '.join(tags)}. Correct the "
                        f"file or rerun with generate_ids\n {feat}")
                self.orphan_types['gene'] += 1
                _id = f"gene_{self.orphan_types['gene']}"
            if 'rna' in feat.type.lower() or feat.type in {
                    'CDS', 'sig_peptide', 'five_prime_UTR', 'three_prime_UTR'
            }:
                _id = f"gene_{self.orphan_types['gene']}"

        return _id

    def _parse_features(self, record, source):
        def _location(feat):
            """Convert to KBase style location objects"""
            strand_trans = ("", "+", "-")
            loc = []
            for part in feat.location.parts:
                contig_id = part.ref if part.ref else record.id
                if part.strand >= 0:
                    begin = int(part.start) + 1
                else:
                    begin = int(part.end)
                loc.append(
                    (contig_id, begin, strand_trans[part.strand], len(part)))
            return loc
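        # e.g. a forward-strand part covering 0-based [99, 199) of record
        # "NC_000913" yields ("NC_000913", 100, "+", 100): contig, 1-based
        # start, strand, length (placeholder contig id)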

        def _warn(message):
            if message not in out_feat.get('warnings', []):
                out_feat['warnings'] = out_feat.get('warnings', []) + [message]

        def _check_suspect_location(parent=None):
            if 'trans_splicing' in out_feat.get('flags', []):
                return

            if out_feat['location'] == sorted(
                    out_feat['location'],
                    reverse=(in_feature.location.strand == -1)):
                return

            if record.id in self.circ_contigs and \
                    in_feature.location.start == 0 \
                    and in_feature.location.end == len(record):
                self.features_spaning_zero.add(out_feat['id'])
                return

            if parent and parent['id'] in self.features_spaning_zero:
                return

            _warn(warnings['not_trans_spliced'])
            self.defects['not_trans_spliced'] += 1

        for in_feature in record.features:
            if in_feature.type in self.excluded_features:
                self.skiped_features[in_feature.type] += 1
                continue
            feat_seq = self._get_seq(in_feature, record.id)
            if source == "Ensembl":
                _id = self._get_id(in_feature, ['gene', 'locus_tag'])
            else:
                _id = self._get_id(in_feature)

            # The following is common to all the feature types
            out_feat = {
                "id": "_".join([_id, in_feature.type]),
                "location": _location(in_feature),
                "dna_sequence": str(feat_seq),
                "dna_sequence_length": len(feat_seq),
                "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
            }
            if not _id:
                out_feat['id'] = in_feature.type

            # validate input feature
            # note that end is the larger number regardless of strand
            if int(in_feature.location.end) > len(record):
                self.genome_warnings.append(
                    warnings["coordinates_off_end"].format(out_feat['id']))
                self.genome_suspect = 1
                continue

            for piece in in_feature.location.parts:
                if not isinstance(piece.start, ExactPosition) \
                        or not isinstance(piece.end, ExactPosition):
                    _warn(warnings["non_exact_coordinates"])

            self.feature_counts[in_feature.type] += 1

            # add optional fields
            if 'note' in in_feature.qualifiers:
                out_feat['note'] = in_feature.qualifiers["note"][0]

            out_feat.update(self._get_aliases_flags_functions(in_feature))

            ont, db_xrefs = self._get_ontology_db_xrefs(in_feature)
            if ont:
                out_feat['ontology_terms'] = ont
            if db_xrefs:
                out_feat['db_xrefs'] = db_xrefs

            if 'inference' in in_feature.qualifiers:
                out_feat['inference_data'] = parse_inferences(
                    in_feature.qualifiers['inference'])

            _check_suspect_location(self.genes.get(_id))

            # add type specific features
            if in_feature.type == 'CDS':
                self.process_cds(_id, feat_seq, in_feature, out_feat)

            elif in_feature.type == 'gene':
                self.process_gene(_id, out_feat)

            elif in_feature.type == 'mRNA':
                self.process_mrna(_id, out_feat)

            else:
                self.noncoding.append(
                    self.process_noncoding(_id, in_feature.type, out_feat))

    def get_feature_lists(self):
        """sort genes into their final arrays"""
        coding = []
        for g in self.genes.values():
            if len(g['cdss']):
                if g['mrnas'] and len(g['mrnas']) != len(g['cdss']):
                    msg = "The length of the mrna and cdss arrays are not equal"
                    g['warnings'] = g.get('warnings', []) + [msg]

                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in g:
                        g[key] = list(set(g[key]))
                if not g['mrnas']:
                    del g['mrnas']
                del g['type']
                coding.append(g)
                self.feature_counts["protein_encoding_gene"] += 1
            else:
                del g['mrnas'], g['cdss']
                self.noncoding.append(g)
                self.feature_counts["non_coding_genes"] += 1

        self.feature_counts["non_coding_features"] = len(self.noncoding)
        return {
            'features': coding,
            'non_coding_features': self.noncoding,
            'cdss': list(self.cdss.values()),
            'mrnas': list(self.mrnas.values())
        }

    def _get_seq(self, feat, contig):
        """Extract the DNA sequence for a feature"""
        seq = []
        for part in feat.location.parts:
            strand = part.strand
            # handle trans-splicing across contigs
            if part.ref:
                part_contig = part.ref
            else:
                part_contig = contig

            if strand >= 0:
                seq.append(
                    str(self.contig_seq[part_contig][part.start:part.end]))
            else:
                seq.append(
                    str(self.contig_seq[part_contig]
                        [part.start:part.end].reverse_complement()))
        return "".join(seq)

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event back."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(f"{ontology_type} is not a supported ontology")

        if "event_index" not in self.ont_mappings[ontology_type]:
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })

        return self.ont_mappings[ontology_type]['event_index']

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = defaultdict(dict)
        db_xrefs = []
        for key in ("GO_process", "GO_function", "GO_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.qualifiers.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        for ref in feature.qualifiers.get('db_xref', []):
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            elif ":" not in ref:
                db_xrefs.append(tuple(["Unknown_Source", ref]))
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))

        return dict(ontology), sorted(db_xrefs)

    @staticmethod
    def _get_aliases_flags_functions(feat):
        """Get the values for aliases flags and features from qualifiers"""
        alias_keys = {
            'locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id',
            'gene', 'EC_number', 'gene_synonym'
        }
        result = defaultdict(list)
        for key, val_list in feat.qualifiers.items():
            if key in alias_keys:
                result['aliases'].extend([(key, val) for val in val_list])
            # flags have no other information associated with them
            if val_list == ['']:
                result['flags'].append(key)
            if key == 'function':
                result['functional_descriptions'].extend(
                    val_list[0].split('; '))
            if key == 'product':
                result['functions'] = val_list

        return result

    def _find_parent_gene(self, potential_id, feature):
        """Unfortunately, Genbank files don't have a parent ID and the features can be out of
        order at times. To account for this, the this function works backwards from the end of
        list of IDs and stops when if finds a parent with valid coordinates or it hits the maximum
        number of tries"""
        if potential_id in self.genes:
            lookup_attempts = 0
            while lookup_attempts < MAX_PARENT_LOOKUPS:
                if is_parent(self.genes[potential_id], feature):
                    return potential_id

                lookup_attempts += 1
                try:
                    potential_id = list(
                        self.genes.keys())[-(lookup_attempts + 1)]
                except IndexError:
                    break  # no more genes that could match exist

            self.defects['bad_parent_loc'] += 1
        return None

    def assign_new_id(self, _id):
        """given a feature id that has already been used, add a unique modifier to it"""
        _id_modifier = self.used_twice_identifiers.get(_id, 1)
        self.used_twice_identifiers[_id] = _id_modifier + 1
        return _id + "." + str(_id_modifier)

    def process_gene(self, _id, out_feat):
        out_feat.update({
            "id": _id,
            "type": 'gene',
            "mrnas": [],
            'cdss': [],
        })
        if _id in self.genes:
            _id = self.assign_new_id(_id)
            out_feat.update({"id": _id})
            # raise ValueError(f"Duplicate gene ID: {_id}")
        self.genes[_id] = out_feat

    def process_noncoding(self, gene_id, feat_type, out_feat):
        out_feat["type"] = feat_type

        # this prevents big misc_features from blowing up the genome size
        if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
            del out_feat['dna_sequence']

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            if 'children' not in self.genes[gene_id]:
                self.genes[gene_id]['children'] = []
            out_feat['id'] += "_" + str(
                len(self.genes[gene_id]['children']) + 1)
            self.genes[gene_id]['children'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types[feat_type] += 1
            out_feat['id'] += "_" + str(self.orphan_types[feat_type])

        return out_feat

    def process_mrna(self, gene_id, out_feat):
        if gene_id not in self.genes and self.generate_parents:
            self.process_gene(gene_id, copy.copy(out_feat))

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            out_feat['id'] = "_".join(
                (gene_id, "mRNA", str(len(self.genes[gene_id]['mrnas']) + 1)))
            self.genes[gene_id]['mrnas'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['mrna'] += 1
            out_feat['id'] = f"mRNA_{self.orphan_types['mrna']}"
            out_feat['warnings'] = out_feat.get('warnings', []) + [
                'Unable to find parent gene for ' + str(out_feat['id'])
            ]

        self.mrnas[out_feat['id']] = out_feat

    def process_cds(self, gene_id, feat_seq, in_feature, out_feat):
        # Associate CDS with parents
        cds_warnings = out_feat.get('warnings', [])
        validated_gene_id = self._find_parent_gene(gene_id, out_feat)
        if validated_gene_id:
            out_feat['id'] = "_".join(
                (validated_gene_id, "CDS",
                 str(len(self.genes[validated_gene_id]['cdss']) + 1)))
            self.genes[validated_gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = validated_gene_id
        elif self.generate_parents and gene_id not in self.genes:
            new_feat = copy.copy(out_feat)
            new_feat['id'] = gene_id
            new_feat['warnings'] = [warnings['spoofed_gene']]
            self.orphan_types['gene'] += 1
            self.defects['spoofed_genes'] += 1
            self.process_gene(new_feat['id'], new_feat)

            out_feat['id'] = "_".join(
                (gene_id, "CDS", str(len(self.genes[gene_id]['cdss']) + 1)))
            self.genes[gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['cds'] += 1
            out_feat['id'] = f"CDS_{self.orphan_types['cds']}"
            cds_warnings.append(
                f"Unable to find parent gene for {out_feat['id']}")

        # there is a 1 to 1 relationship of mRNA to CDS so XXX_mRNA_1 will match XXX_CDS_1
        mrna_id = out_feat["id"].replace('CDS', 'mRNA')
        if mrna_id in self.mrnas:
            if not is_parent(self.mrnas[mrna_id], out_feat):
                cds_warnings.append(warnings['cds_mrna_cds'].format(mrna_id))
                self.mrnas[mrna_id]['warnings'] = self.mrnas[mrna_id].get(
                    'warnings', []) + [warnings['cds_mrna_mrna']]
                self.defects['bad_parent_loc'] += 1
            else:
                out_feat['parent_mrna'] = mrna_id
                self.mrnas[mrna_id]['cds'] = out_feat['id']

        # process protein
        prot_seq = in_feature.qualifiers.get("translation", [""])[0]

        # allow a little slack to account for frameshift and stop codon
        if prot_seq and abs(len(prot_seq) * 3 - len(feat_seq)) > 4:
            cds_warnings.append(warnings["inconsistent_CDS_length"].format(
                len(feat_seq), len(prot_seq)))
            self.genome_warnings.append(
                warnings['genome_inc_CDS_length'].format(
                    out_feat['id'], len(feat_seq), len(prot_seq)))
            self.genome_suspect = 1

        try:
            if prot_seq and prot_seq != Seq.translate(
                    feat_seq, self.code_table, cds=True).strip("*"):
                cds_warnings.append(warnings["inconsistent_translation"])
                self.defects['cds_seq_not_matching'] += 1

        except TranslationError as e:
            cds_warnings.append("Unable to verify protein sequence:" + str(e))

        if not prot_seq:
            try:
                prot_seq = Seq.translate(feat_seq, self.code_table,
                                         cds=True).strip("*")
                cds_warnings.append(warnings["no_translation_supplied"])

            except TranslationError as e:
                cds_warnings.append(warnings["no_translation_supplied"] +
                                    str(e))

        out_feat.update({
            "protein_translation":
            prot_seq,
            "protein_md5":
            hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
            "protein_translation_length":
            len(prot_seq),
        })

        if out_feat.get('parent_gene'):
            propagate_cds_props_to_gene(out_feat,
                                        self.genes[out_feat['parent_gene']])

        if cds_warnings:
            out_feat['warnings'] = cds_warnings

        self.cdss[out_feat['id']] = out_feat
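
A minimal, standalone sketch (not part of the module above) of the translation-consistency check that process_cds performs: translate the CDS nucleotide sequence with Biopython and compare it to the supplied protein. The function name and the example sequences are illustrative, and Biopython is assumed to be installed.

from Bio.Seq import Seq
from Bio.Data.CodonTable import TranslationError


def translation_matches(cds_seq, prot_seq, code_table=11):
    """Return True if prot_seq matches the translation of cds_seq."""
    try:
        # cds=True enforces a valid start codon, a terminal stop codon,
        # and a length that is a multiple of three
        translated = str(Seq(cds_seq).translate(table=code_table,
                                                cds=True)).strip("*")
    except TranslationError:
        return False
    return translated == prot_seq


# e.g. translation_matches("ATGGCCTAA", "MA") -> True (hypothetical sequences)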
Example #5
    def download_long(self, console, warnings, token, wsname, lib,
                      min_long_read_length):
        try:
            # object info
            try:
                wsClient = Workspace(self.workspaceURL, token=token)
            except Exception as e:
                raise ValueError("unable to instantiate wsClient. " + str(e))

            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple

            obj_id = {'ref': lib if '/' in lib else (wsname + '/' + lib)}
            lib_obj_info = wsClient.get_object_info_new({'objects':
                                                         [obj_id]})[0]
            lib_obj_type = lib_obj_info[TYPE_I]
            lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                  lib_obj_type)  # remove trailing version
            lib_ref = str(lib_obj_info[WSID_I])+'/' + \
                str(lib_obj_info[OBJID_I])+'/'+str(lib_obj_info[VERSION_I])
            if lib_obj_type == 'KBaseGenomes.ContigSet' or lib_obj_type == 'KBaseGenomeAnnotations.Assembly':
                # download using assembly util / data file util
                self.log(console,
                         "Getting long reads (from contigs object).\n")
                auClient = AssemblyUtil(url=self.callbackURL, token=token)
                dfuClient = DataFileUtil(url=self.callbackURL, token=token)
                contig_file = auClient.get_assembly_as_fasta({
                    'ref': lib_ref
                }).get('path')
                long_reads_path = dfuClient.unpack_file(
                    {'file_path': contig_file})['file_path']
                self.log(
                    warnings,
                    "Warning:  Long reads are in FASTA format, so short read check was not performed."
                )

            else:
                ruClient = ReadsUtils(url=self.callbackURL, token=token)
                self.log(console,
                         "Getting long reads (from reads library object).\n")
                result = ruClient.download_reads({
                    'read_libraries': [lib_ref],
                    'interleaved': 'false'
                })
                long_reads_path = result['files'][lib_ref]['files']['fwd']
                [n_reads, n_reads_short
                 ] = self.filter_short_fastq(console, long_reads_path,
                                             min_long_read_length)
                if (n_reads_short > 0):
                    self.log(
                        warnings, "Warning:  Of " + str(n_reads) +
                        " long reads, " + str(n_reads_short) +
                        " are shorter than " + str(min_long_read_length) +
                        "; consider using the filtlong app to filter out shorter reads."
                    )

        except Exception as e:
            raise ValueError('Unable to download long reads\n' + str(e))
        return long_reads_path
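
filter_short_fastq is called above but not shown; the following is a minimal sketch of the kind of check such a helper might perform, assuming an uncompressed four-line-per-record FASTQ file. The function name and return convention here are illustrative, not the module's actual code.

def count_short_reads(fastq_path, min_length):
    """Count total reads and reads shorter than min_length in a FASTQ file."""
    n_reads = 0
    n_short = 0
    with open(fastq_path) as fastq_handle:
        for i, line in enumerate(fastq_handle):
            if i % 4 == 1:  # the sequence line of each 4-line FASTQ record
                n_reads += 1
                if len(line.strip()) < min_length:
                    n_short += 1
    return n_reads, n_short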
Example #6
class VCFToVariation:
    def __init__(self, config, scratch, callback_url):
        self.scratch = scratch
        self.ws_url = config['workspace-url']
        self.callback_url = callback_url
        self.dfu = DataFileUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)
        self.au = AssemblyUtil(self.callback_url)
        self.gapi = GenericsAPI(self.callback_url)


    def _parse_vcf_data(self, params):
        vcf_filepath = self._stage_input(params)

        # file is validated by this point, can assume vcf_filepath is valid
        reader = vcf.Reader(open(vcf_filepath, 'r'))

        version = float(reader.metadata['fileformat'][4:6])
        genotypes = reader.samples
        chromosomes = []
        contigs = {}
        totalvars = 0

        for record in reader:
            totalvars += 1
            if record.CHROM not in chromosomes:
                chromosomes.append(record.CHROM)

            if record.CHROM not in contigs.keys():
                passvar = 1 if not record.FILTER else 0

                contigs[record.CHROM] = {
                    'contig_id': record.CHROM,
                    'totalvariants': 1,
                    'passvariants': passvar,
                    'length': int(record.affected_end-record.affected_start),
                }
            else:
                contigs[record.CHROM]['totalvariants'] += 1
                if not record.FILTER:
                    contigs[record.CHROM]['passvariants'] += 1

        vcf_info = {
            'version': version,
            'contigs': contigs,
            'total_variants': totalvars,
            'genotype_ids': genotypes,
            'chromosome_ids': chromosomes,
            'file_ref': vcf_filepath
        }

        return vcf_info


    def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids):
        genos_not_found = []

        vgenotypes = [x.upper().strip() for x in vcf_genotypes]
        sids = [x.upper().strip() for x in sample_ids]

        for geno in vgenotypes:
            if geno not in sids:
                genos_not_found.append(geno)

        if not genos_not_found:
            return True
        else:
            return genos_not_found

    def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes, assembly_chromosomes):
        chromos_not_in_assembly = []

        pp(assembly_chromosomes)

        for chromo in vcf_chromosomes:
            if chromo not in assembly_chromosomes:
                chromos_not_in_assembly.append(chromo)

        if not chromos_not_in_assembly:
            return True
        else:
            return chromos_not_in_assembly

    def _get_vcf_version(self, vcf_filepath):
        with (gzip.open if is_gz_file(vcf_filepath) else open)(vcf_filepath, 'rt') as vcf_handle:
            line = vcf_handle.readline()
            tokens = line.split('=')

            if not (tokens[0].startswith('##fileformat')):
                log("Invalid VCF.  ##fileformat line in meta is improperly formatted.")
                raise ValueError("Invalid VCF.  ##fileformat line in meta is improperly formatted. "
                                 "Check VCF file specifications: https://samtools.github.io/hts-specs/")

            vcf_version = float(tokens[1][-4:].rstrip())
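            # e.g. a header line of "##fileformat=VCFv4.2" yields 4.2 (illustrative)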

            return vcf_version

    def validate_vcf(self, params):
        if 'genome_or_assembly_ref' not in params:
            raise ValueError('Genome or Assembly reference not in input parameters:\n\n' + str(params))
        if 'vcf_staging_file_path' not in params:
            raise ValueError('VCF staging file path not in input parameters:\n\n' + str(params))


        vcf_filepath = self._stage_input(params)

        vcf_version = self._get_vcf_version(vcf_filepath)

        # set up directories for validation output
        validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        # vcftools (vcf-validator) supports VCF v4.0-4.2
        # https://github.com/vcftools/vcftools

        # EBIvariation/vcf-validator (vcf_validator_linux) supports VCF v4.1-4.3
        # https://github.com/EBIvariation/vcf-validator

        # vcftools is only used to validate VCF v4.0

        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-l")
            validator_cmd.append('error')
            print("VCF version "+str(vcf_version)+".")
        elif vcf_version >= 4.0:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version 4.0.")
        else:
            raise ValueError('VCF version missing from file, fileformat line malformed, or version < 4.0. '
                             'The fileformat line must be the first line of the VCF file and use valid syntax. '
                             'Check VCF file specifications: https://samtools.github.io/hts-specs/')

        print("Validator command: {}".format(validator_cmd))

        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            if line.decode("utf-8").strip().startswith('[info]'):
                validator_output.append(line.decode("utf-8"))

        out, err = p.communicate()

        validation_output_filename = os.path.join(validation_output_dir, 'vcf_validation.txt')
        file_output_chk = []

        try:
            if validator_output[0][:6] == '[info]':
                # validation by vcf_validator_linux
                validation_output_filename = validator_output[1].split(' ')[6].strip('\n')
                vo = validator_output[2].split(' ')
                file_output_chk = ''.join(vo[9:]).strip('\n')

                if not os.path.exists(validation_output_filename):
                    raise ValueError(validation_output_filename+' does not exist!')

                if not file_output_chk == 'isvalid':
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))

                #TODO: more detailed validation parsing for vcf_validator_linux
            else:
                if validator_output:
                    with open(validation_output_filename, 'w') as f:
                        for line in validator_output:
                            f.write(str(line))
                        f.close()
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))
                else:
                    with open(validation_output_filename, 'w') as f:
                        f.write("vcftools used to validate vcf file:\n"+vcf_filepath+"\n\File is validate as of vcf spec v4.0")
                        f.close()

                # TODO: more detailed validation parsing for vcftools
        except IndexError:
            # vcftools output for a valid file < v4.1 contains no '[info]' lines,
            # so indexing validator_output above raises IndexError
            if validator_output:
                with open(validation_output_filename, 'w') as f:
                    for line in validator_output:
                        f.write(str(line))
                    f.close()
                print('\n'.join(validator_output))
                raise ValueError('\n'.join(validator_output))
            else:
                with open(validation_output_filename, 'w') as f:
                    f.write("vcftools used to validate vcf file:\n" + vcf_filepath + "\n\File is validate as of vcf spec v4.0")
                    f.close()

        if not os.path.exists(validation_output_filename):
            print('Validator did not generate log file!')
            raise SystemError("Validator did not generate a log file.")

        log("Validator output filepath: {}".format(validation_output_filename))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filename

    def _stage_input(self, params):
        # extract file location from input ui parameters
        if params['vcf_staging_file_path'].startswith('/kb/module/test/'):
            # variation utils unit test
            vcf_local_file_path = params['vcf_staging_file_path']

            if vcf_local_file_path.endswith('.gz'):
                with gzip.open(vcf_local_file_path, 'rb') as f_in:
                    with open(vcf_local_file_path[:-3], 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)

                vcf_local_file_path = vcf_local_file_path[:-3]
        else:
            staging_dir = '/staging'
            vcf_local_file_path = os.path.join(staging_dir, params['vcf_staging_file_path'])

        if not os.path.exists(vcf_local_file_path):
            raise OSError('VCF input path does not exist, or is not readable')

        orig_file_path = os.path.join(self.scratch, 'original_' + os.path.basename(vcf_local_file_path))
        print(f'VCF: {vcf_local_file_path} Orig: {orig_file_path}')
        self.original_file = shutil.copy(vcf_local_file_path, orig_file_path)

        # TODO: use data file utils here, upload vcf to shock, use dfu.
        if is_gz_file(vcf_local_file_path):
            # /staging is read only, therefore have to copy before uncompressing
            if not vcf_local_file_path == os.path.join(self.scratch, params['vcf_staging_file_path']):
                copy = shutil.copy(vcf_local_file_path, os.path.join(self.scratch,params['vcf_staging_file_path']))
                unpack = self.dfu.unpack_file({'file_path': copy})
            else:
                unpack = {}
                unpack['file_path'] = os.path.join(self.scratch,params['vcf_staging_file_path'])
            params['vcf_local_file_path'] = unpack['file_path']
            return unpack['file_path']
        else:
            params['vcf_local_file_path'] = vcf_local_file_path 
            return vcf_local_file_path

    def _create_sample_attribute_file(self, vcf_file, sample_attribute_mapping_file):
        """
        Create a sample attribute mapping file from the #CHROM header line of a VCF file.
        """
        try:
            with open(vcf_file, 'r') as vcf_handle:
                lines = vcf_handle.readlines()

            for line in lines:
                if line.startswith("#CHROM"):
                    header = line.lstrip().split("\t")

                    try:
                        with open(sample_attribute_mapping_file, 'w') as attribute_mapping_handle:
                            # the last header field keeps its trailing newline from readlines(),
                            # so each row below is terminated without an explicit write("\n")
                            attribute_mapping_handle.write("Attribute\tAttribute ontology ID\tUnit\tUnit ontology ID")
                            for i in range(9, len(header)):
                                attribute_mapping_handle.write("\t" + header[i])

                            attribute_mapping_handle.write("label\t\t\t")
                            for j in range(9, len(header)):
                                attribute_mapping_handle.write("\t" + header[j])
                    except IOError:
                        print("Could not write to file:", sample_attribute_mapping_file)
        except IOError:
            print("Could not read file:", vcf_file)

    def _validate_assembly_ids(self, params):
        # All chromosome ids from the vcf should be in assembly
        # but not all assembly chromosome ids should be in vcf


        if ('genome_ref' in params):
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': params['genome_or_assembly_ref']
            }])

            self.vcf_info['assembly_ref'] = subset[0]['data']['assembly_ref']

        if ('assembly_ref' in params):
            self.vcf_info['assembly_ref'] = params['assembly_ref']

        assembly_chromosome_ids_call = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])

        assembly_chromosomes = assembly_chromosome_ids_call[0]['data']['contigs'].keys()
        vcf_chromosomes = self.vcf_info['chromosome_ids']

        chk_assembly_ids =  self._chk_if_vcf_ids_in_assembly(vcf_chromosomes, assembly_chromosomes)

        if isinstance(chk_assembly_ids, list):
            failed_ids = ' '.join(chk_assembly_ids)
            print(f'VCF contig ids: {failed_ids} are not present in assembly.')
            raise ValueError(f'VCF contig ids: {failed_ids} are not present in assembly.')


        return assembly_chromosomes

    def _validate_sample_ids(self, params):
        # All samples within the VCF file need to be in sample attribute list


        vcf_genotypes = self.vcf_info['genotype_ids']

        sample_ids_subset = self.wsc.get_object_subset([{
            'included': ['/instances'],
            'ref': params['sample_attribute_ref']
        }])

        sample_ids = sample_ids_subset[0]['data']['instances'].keys()

        validate_genotypes = self._validate_vcf_to_sample(vcf_genotypes, sample_ids)

        if isinstance(validate_genotypes, list):
            failed_genos = ' '.join(validate_genotypes)
            print(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')
            raise ValueError(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')

        return sample_ids

    def _construct_contig_info(self, params):
        """
            KBaseGwasData.Variations type spec

            /*
               Contig variation data
                 contig_id - contig identifier
                 totalvariants - total number of variants in each contig
                 passvariants - total number of variants that pass quality variation filter in contig
                 length - length of contig from assembly data
             */

             typedef structure {
               string contig_id;
               int totalvariants;
               int passvariants;
               int length; // from assembly
             } contig_info;
        """

        assembly_chromosome_dict = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])[0]['data']['contigs']


        contigs = []

        contig_infos = self.vcf_info['contigs']


        for contig_id in contig_infos:
            length_contig = assembly_chromosome_dict[contig_id].get("length")
            contig_infos[contig_id]["length"] = length_contig
            contigs.append(contig_infos[contig_id])

        return contigs
   

    def _bgzip_vcf(self, vcf_filepath):
        if not os.path.exists(vcf_filepath):
            print(vcf_filepath + " does not exist")

        zip_cmd = ["bgzip", vcf_filepath]

        p = subprocess.Popen(zip_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()

        bgzip_file_path = vcf_filepath + ".gz"
        print(bgzip_file_path)

        return bgzip_file_path
  
 
    def _index_vcf(self, bgzip_file):
        bgzip_filepath = os.path.join(self.scratch, bgzip_file)
        if not os.path.exists(bgzip_filepath):
            print(bgzip_filepath + " does not exist")

        index_cmd = ["tabix", "-p", "vcf", bgzip_filepath]
        p = subprocess.Popen(index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()

        index_file_path = bgzip_filepath + ".tbi"

        return index_file_path

    def _index_assembly(self, assembly_file):
        if not os.path.exists(assembly_file):
            print(assembly_file + " does not exist")

        logging.info("indexing assembly file")

        assembly_index_cmd = ["samtools", "faidx", assembly_file]
        print(assembly_index_cmd)
        p = subprocess.Popen(assembly_index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()

        logging.info("indexing of assembly file done!")

        return assembly_file + ".fai"

    def _download_assembly(self, assembly_ref):
        file = self.au.get_assembly_as_fasta({
          'ref': assembly_ref
        })
        return file
 
    def _construct_variation(self, params, contigs_info):
        
        """
            KBaseGwasData.Variations type spec
             /*
               Variation object data structure
                 num_genotypes - number of total genotypes within variant file
                 num_variants - number of total variants within variant file
                 contigs - list of contig ids and variant information
                 attribute_ref - KBase reference to attribute mapping workspace object
                 genome_ref - KBase reference to genome workspace object
                 assembly_ref - KBase reference to assembly workspace object
                 vcf_handle_ref - VCF handle reference to VCF file

                 @optional genome_ref
             */
             typedef structure {
               int numgenotypes;
               int numvariants;
               list<contig_info> contigs;
               attribute_ref population; // KBaseExperiments.AttributeMapping
               genome_ref genome_ref; // KBaseGenomes.Genome
               assembly_ref assemby_ref; // KBaseGenomeAnnotations.Assembly
               vcf_handle_ref vcf_handle_ref;
             } Variations;

            :param params: KBase ui input parameters
            :param population: previously constructed sample population data
            :return: constructed variation object (dictionary)
        """

        if not self.vcf_info['file_ref'].startswith(self.scratch):
            new_vcf_file = os.path.join(self.scratch, os.path.basename(self.vcf_info['file_ref']))
            self.vcf_info['file_ref'] = shutil.copy(self.vcf_info['file_ref'], new_vcf_file)
      

        vcf_staged_file = self.original_file

        bgzip_file_path = self._bgzip_vcf(vcf_staged_file)
        vcf_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': bgzip_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(bgzip_file_path, vcf_shock_file_ref)


        index_file_path = self._index_vcf(bgzip_file_path)
        vcf_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(index_file_path, vcf_index_shock_file_ref)


        assembly_file_path = self._download_assembly(self.vcf_info['assembly_ref'])['path']

        assembly_index_file_path = self._index_assembly(assembly_file_path)
        assembly_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': assembly_index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(assembly_index_file_path, assembly_index_shock_file_ref)
        
        variation_obj = {
            'numgenotypes': int(len(self.vcf_info['genotype_ids'])),
            'numvariants': int(self.vcf_info['total_variants']),
            'contigs': contigs_info,
            'population': params['sample_attribute_ref'],

            # TYPE SPEC CHANGE: need to change type spec to assembly_ref instead of assemby_ref
            'assemby_ref': self.vcf_info['assembly_ref'],
            'vcf_handle_ref': vcf_shock_file_ref['handle']['hid'],
            'vcf_handle' : vcf_shock_file_ref['handle'],
            'vcf_index_handle_ref': vcf_index_shock_file_ref['handle']['hid'],
            'vcf_index_handle': vcf_index_shock_file_ref['handle'],
            'assembly_index_handle_ref': assembly_index_shock_file_ref['handle']['hid'],
            'assembly_index_handle': assembly_index_shock_file_ref['handle']
        }
        if 'genome_ref' in params:
            variation_obj['genome_ref'] =  params['genome_ref']

        return variation_obj

    def _save_var_obj(self, params, var):
        """
        :param params:
        :param var:
        :return:
            DataFileUtils object_info:
                objid - the numerical id of the object.
                name - the name of the object.
                type - the type of the object.
                save_date - the save date of the object.
                ver - the version of the object.
                saved_by - the user that saved or copied the object.
                wsid - the id of the workspace containing the object.
                workspace - the name of the workspace containing the object.
                chsum - the md5 checksum of the object.
                size - the size of the object in bytes.
                meta - arbitrary user-supplied metadata about the object.
        """

        print('Saving Variation to workspace...\n')

        if var:
            if 'variation_object_name' not in params:
                var_obj_name = 'variation_'+str(uuid.uuid4())
            else:
                var_obj_name = params['variation_object_name']

            var_obj_info = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['workspace_name']),
                'objects': [{
                    'type': 'KBaseGwasData.Variations',
                    'data': var,
                    'name': var_obj_name
                }]
            })[0]

            return var_obj_info
        else:
            raise ValueError('Variation object is empty, cannot save to workspace!')

    def _validate_sample_attribute_ref(self, params):

        #params["sample_attribute_ref"] = ''  #just for testing
        if not params['sample_attribute_ref']:
            sample_attribute_mapping_file = os.path.join(self.scratch, "sample_attribute.tsv")  # hardcoded for testing
            self._create_sample_attribute_file(params['vcf_local_file_path'],
                                               sample_attribute_mapping_file)

            logging.info("Uploading sample attribute file to ref")
            vcf_sample_attribute_shock_file_ref = self.dfu.file_to_shock(
                {'file_path': sample_attribute_mapping_file, 'make_handle': 1}
            )
            shock_id = vcf_sample_attribute_shock_file_ref['shock_id']
            ws_id = self.dfu.ws_name_to_id(params['workspace_name'])
            import_params = {
                'input_shock_id': shock_id,
                'output_ws_id': ws_id,
                'output_obj_name': 'Sample_attribute'}

            ret = self.gapi.file_to_attribute_mapping(import_params)
            params['sample_attribute_ref'] = ret['attribute_mapping_ref']

    def import_vcf(self, params):
        # VCF file validation
        file_valid_result = self.validate_vcf(params)
        self._validate_sample_attribute_ref(params)
        # VCF file parsing
        self.vcf_info = self._parse_vcf_data(params)
        # Validate vcf chromosome ids against assembly chromosome ids
        self._validate_assembly_ids(params)
        # Validate vcf genotypes against sample meta data ids
        self._validate_sample_ids(params)

        # Variation object construction
        # construct contigs_info
        contigs_info = self._construct_contig_info(params)
        # construct variation
        var = self._construct_variation(params, contigs_info)

        # Save variation object to workspace
        var_wksp_obj = self._save_var_obj(params, var)

        return [var_wksp_obj, var]
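
A hypothetical driver for the class above (a sketch, not code from the module): it shows the configuration and parameters that import_vcf expects. All workspace references, names, URLs, and file paths below are placeholders.

import os

config = {
    'workspace-url': 'https://kbase.us/services/ws',   # placeholder service URL
    'scratch': '/kb/module/work/tmp',
}
importer = VCFToVariation(config,
                          scratch=config['scratch'],
                          callback_url=os.environ['SDK_CALLBACK_URL'])

params = {
    'workspace_name': 'my_workspace',            # placeholder workspace
    'genome_or_assembly_ref': '123/4/5',         # placeholder UPA
    'assembly_ref': '123/4/5',                   # placeholder UPA
    'vcf_staging_file_path': 'variants.vcf.gz',  # file name in /staging
    'sample_attribute_ref': '123/6/1',           # placeholder UPA
    'variation_object_name': 'my_variation',
}
var_obj_info, variation_obj = importer.import_vcf(params)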
Example #7
class FastaGFFToGenome:
    def __init__(self, config):
        self.cfg = config
        self.au = AssemblyUtil(config.callbackURL)
        self.dfu = DataFileUtil(self.cfg.callbackURL)
        self.gi = GenomeInterface(self.cfg)
        self.taxon_wsname = self.cfg.raw['taxon-workspace-name']
        self.time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        self.version = re.search(r"module-version:\n\W+(.+)\n",
                                 yml_text).group(1)
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        self.code_table = 11
        self.skip_types = ('exon', 'five_prime_UTR', 'three_prime_UTR',
                           'start_codon', 'stop_codon', 'region', 'chromosome',
                           'scaffold')
        self.spoof_gene_count = 0
        self.is_phytozome = False
        self.strict = True
        self.generate_genes = False
        self.warnings = []
        self.feature_dict = collections.OrderedDict()
        self.cdss = set()
        self.ontologies_present = collections.defaultdict(dict)
        self.ontology_events = list()
        self.skiped_features = collections.Counter()
        self.feature_counts = collections.Counter()

    def warn(self, message):
        self.warnings.append(message)

    def generate_genome_json(self, params):
        # 1) validate parameters
        self._validate_import_file_params(params)
        self.code_table = params.get('genetic_code', 11)
        # 2) construct the input directory staging area
        input_directory = os.path.join(self.cfg.sharedFolder,
                                       'fast_gff_upload_' + str(uuid.uuid4()))
        os.makedirs(input_directory)
        file_paths = self._stage_input(params, input_directory)
        # 3) extract out the parameters
        params = self._set_parsed_params(params)
        if params.get('generate_missing_genes'):
            self.generate_genes = True

        # 4) do the upload
        genome = self._gen_genome_json(
            input_fasta_file=file_paths["fasta_file"],
            input_gff_file=file_paths["gff_file"],
            workspace_name=params['workspace_name'],
            core_genome_name=params['genome_name'],
            scientific_name=params['scientific_name'],
            source=params['source'],
            source_id=params['source_id'],
            release=params['release'],
        )
        if params.get('genetic_code'):
            genome["genetic_code"] = params['genetic_code']

        return genome, input_directory

    def import_file(self, params):

        genome, input_directory = self.generate_genome_json(params)

        json.dump(genome,
                  open(
                      "{}/{}.json".format(self.cfg.sharedFolder, genome['id']),
                      'w'),
                  indent=4)
        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params.get('metadata', {}),
        })
        report_string = 'A genome with {} contigs and the following feature ' \
                        'types was imported: {}'\
            .format(len(genome['contig_ids']), "\n".join(
                [k + ": " + str(v) for k, v in genome['feature_counts'].items()]))
        log(report_string)

        # 5) clear the temp directory
        shutil.rmtree(input_directory)

        # 6) return the result
        info = result['info']
        details = {
            'genome_ref':
            str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]),
            'genome_info': info
        }

        return details

    def _gen_genome_json(self,
                         input_gff_file=None,
                         input_fasta_file=None,
                         workspace_name=None,
                         core_genome_name=None,
                         scientific_name="unknown_taxon",
                         source=None,
                         source_id=None,
                         release=None):

        # reading in GFF file
        features_by_contig = self._retrieve_gff_file(input_gff_file)
        contig_ids = set()

        # parse feature information
        fasta_contigs = Bio.SeqIO.parse(input_fasta_file, "fasta")
        for contig in fasta_contigs:
            molecule_type = str(contig.seq.alphabet).replace(
                'IUPACAmbiguous', '').strip('()')
            contig_ids.add(contig.id)
            for feature in features_by_contig.get(contig.id, []):
                self._transform_feature(contig, feature)

        for cid in set(features_by_contig.keys()) - contig_ids:
            self.warn("Sequence name {} does not match a sequence id in the "
                      "FASTA file. {} features will not be imported.".format(
                          cid, len(features_by_contig[cid])))
            if self.strict:
                raise ValueError(
                    "Every feature sequence id must match a fasta sequence id")
        self._process_cdss()

        # save assembly file
        assembly_ref = self.au.save_assembly_from_fasta({
            'file': {
                'path': input_fasta_file
            },
            'workspace_name':
            workspace_name,
            'assembly_name':
            core_genome_name + ".assembly"
        })
        assembly_data = self.dfu.get_objects({
            'object_refs': [assembly_ref],
            'ignore_errors': 0
        })['data'][0]['data']

        # generate genome info
        genome = self._gen_genome_info(core_genome_name, scientific_name,
                                       assembly_ref, source, source_id,
                                       assembly_data, input_gff_file,
                                       molecule_type)
        genome['release'] = release
        if self.spoof_gene_count > 0:
            genome['warnings'] = genome.get('warnings', []) + \
                                    [warnings['spoofed_genome'].format(self.spoof_gene_count)]
            genome['suspect'] = 1

        return genome

    @staticmethod
    def _location(in_feature):
        in_feature['strand'] = in_feature['strand'].replace(
            "-1", "-").translate(strand_table)
        if in_feature['strand'] == '+':
            start = in_feature['start']
        elif in_feature['strand'] == '-':
            start = in_feature['end']
        else:
            raise ValueError('Invalid feature strand: {}'.format(
                in_feature['strand']))
        return [
            in_feature['contig'], start, in_feature['strand'],
            in_feature['end'] - in_feature['start'] + 1
        ]

    @staticmethod
    def _validate_import_file_params(params):
        """
        validate_import_file_params:
                    validates params passed to FastaGFFToGenome.import_file method

        """

        # check for required parameters
        for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # one and only one of 'path', or 'shock_id' is required
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            if not isinstance(file, dict):
                raise ValueError(
                    'Required "{}" field must be a map/dict'.format(key))
            n_valid_fields = 0
            if 'path' in file and file['path'] is not None:
                n_valid_fields += 1
            if 'shock_id' in file and file['shock_id'] is not None:
                n_valid_fields += 1
            if 'ftp_url' in file and file['ftp_url'] is not None:
                n_valid_fields += 1
                raise ValueError(
                    'FTP link is currently not supported for FastaGFFToGenome')
            if n_valid_fields < 1:
                error_msg = 'Required "{}" field must include one source: '.format(
                    key)
                error_msg += 'path | shock_id'
                raise ValueError(error_msg)
            if n_valid_fields > 1:
                error_msg = 'Required "{}" field has too many sources specified: '.format(
                    key)
                error_msg += str(list(file.keys()))
                raise ValueError(error_msg)
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(
                    "Invalid genetic code specified: {}".format(params))

    def _set_parsed_params(self, params):
        log('Setting params')

        default_params = {
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'scientific_name': 'unknown_taxon',
            'taxon_reference': None,
            'source': 'User',
            'release': None,
            'metadata': {},
            'source_id': 'unknown',
        }
        default_params.update(params)
        log(json.dumps(default_params, indent=1))
        return default_params

    def _stage_input(self, params, input_directory):
        """
        stage_input: Setup the input_directory by fetching the files and uncompressing if needed

        """

        file_paths = dict()
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            file_path = None
            if 'path' in file and file['path'] is not None:
                local_file_path = file['path']
                file_path = os.path.join(input_directory,
                                         os.path.basename(local_file_path))
                log('Moving file from {} to {}'.format(local_file_path,
                                                       file_path))
                shutil.copy2(local_file_path, file_path)

            if 'shock_id' in file and file['shock_id'] is not None:
                # handle shock file
                log('Downloading file from SHOCK node: {}-{}'.format(
                    self.cfg.sharedFolder, file['shock_id']))
                sys.stdout.flush()
                file_name = self.dfu.shock_to_file({
                    'file_path': input_directory,
                    'shock_id': file['shock_id']
                })['node_file_name']
                file_path = os.path.join(input_directory, file_name)

            # extract the file if it is compressed
            if file_path is not None:
                log("staged input file =" + file_path)
                sys.stdout.flush()
                dfUtil_result = self.dfu.unpack_file({'file_path': file_path})
                file_paths[key] = dfUtil_result['file_path']
            else:
                raise ValueError(
                    'No valid files could be extracted based on the input')

        return file_paths

    def _retrieve_gff_file(self, input_gff_file):
        """
        _retrieve_gff_file: retrieve info from gff_file
    
        """
        log("Reading GFF file")

        feature_list = collections.defaultdict(list)
        is_patric = 0

        gff_file_handle = open(input_gff_file)
        current_line = gff_file_handle.readline()
        line_count = 0

        while (current_line != ''):
            current_line = current_line.strip()

            if (current_line.isspace() or current_line == ""
                    or current_line.startswith("#")):
                pass
            else:
                #Split line
                (contig_id, source_id, feature_type, start, end, score, strand,
                 phase, attributes) = current_line.split('\t')

                #Checking to see if Phytozome
                if "phytozome" in source_id.lower():
                    self.is_phytozome = True

                #Checking to see if PATRIC
                if "PATRIC" in source_id:
                    is_patric = True

                #PATRIC prepends their contig ids with some gibberish
                if is_patric and "|" in contig_id:
                    contig_id = contig_id.split("|", 1)[1]

                #Populating basic feature object
                ftr = {
                    'contig': contig_id,
                    'source': source_id,
                    'type': feature_type,
                    'start': int(start),
                    'end': int(end),
                    'score': score,
                    'strand': strand,
                    'phase': phase,
                    'attributes': collections.defaultdict(list)
                }

                #Populating with attribute key-value pair
                #This is where the feature id is from
                for attribute in attributes.split(";"):
                    attribute = attribute.strip()

                    #Sometimes empty string
                    if not attribute:
                        continue

                    #Limit the split to 1 because '=' may also appear in the value
                    #Sometimes "=" is missing, so assume space-separated key/value
                    if ("=" in attribute):
                        key, value = attribute.split("=", 1)
                        ftr['attributes'][key.lower()].append(
                            parse.unquote(value.strip('"')))
                    elif (" " in attribute):
                        key, value = attribute.split(" ", 1)
                        ftr['attributes'][key.lower()].append(
                            parse.unquote(value.strip('"')))
                    else:
                        pass
                        #log("Warning: attribute "+attribute+" cannot be separated into key,value pair")

                ftr['attributes']['raw'] = attributes
                if "id" in ftr['attributes']:
                    ftr['ID'] = ftr['attributes']['id'][0]
                if "parent" in ftr['attributes']:
                    ftr['Parent'] = ftr['attributes']['parent'][0]

                feature_list[contig_id].append(ftr)

            current_line = gff_file_handle.readline()

        gff_file_handle.close()

        #Some GFF/GTF files don't use "ID" so we go through the possibilities
        feature_list = self._add_missing_identifiers(feature_list)

        #Most bacterial files have only CDSs
        #In order to work with prokaryotic and eukaryotic gene structure synonymously
        #Here we add feature dictionaries representing the parent gene and mRNAs
        #feature_list = self._add_missing_parents(feature_list)

        #Phytozome has the annoying habit of editing their identifiers so we fix them
        if self.is_phytozome:
            self._update_phytozome_features(feature_list)

        #All identifiers need to be checked so that they follow the same general rules
        #Rules are listed within the function itself
        feature_list = self._update_identifiers(feature_list)

        return feature_list
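    # Illustrative (hypothetical) example: a tab-separated GFF line such as
    #   Chr1  maker  gene  1000  2000  .  +  .  ID=GeneA;Name=GeneA
    # is parsed into (abbreviated):
    #   {'contig': 'Chr1', 'type': 'gene', 'start': 1000, 'end': 2000,
    #    'strand': '+', 'ID': 'GeneA', 'attributes': {'id': ['GeneA'], ...}}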

    def _add_missing_identifiers(self, feature_list):
        log("Adding missing identifiers")
        #General rule is to iterate through a range of possibilities if "ID" is missing
        for contig in feature_list:
            for i, feat in enumerate(feature_list[contig]):
                if "ID" not in feature_list[contig][i]:
                    for key in ("transcriptid", "proteinid", "pacid", "parent",
                                "name", 'transcript_id'):
                        if key in feature_list[contig][i]['attributes']:
                            feature_list[contig][i]['ID'] = feature_list[
                                contig][i]['attributes'][key][0]
                            break
                    if feat['type'] not in self.skip_types:
                        self.feature_counts[feat['type']] += 1

                    #If the process fails, throw an error
                    if "ID" not in feature_list[contig][i]:
                        feat['ID'] = "{}_{}".format(
                            feat['type'], self.feature_counts[feat['type']])
                        #log("Warning: Could find unique ID to utilize in GFF attributes: {}. "
                        #    "ID '{}' has been assigned".format(feat['attributes'], feat['ID']))
        return feature_list

    def _add_missing_parents(self, feature_list):

        #General rules is if CDS or RNA missing parent, add them
        for contig in feature_list:
            ftrs = feature_list[contig]
            new_ftrs = []
            for i in range(len(ftrs)):
                if ftrs[i]["type"] in self.skip_types:
                    continue
                if ("Parent" not in ftrs[i]):
                    #Assuming parent doesn't exist at all, so create de novo instead of trying to find it
                    if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]):
                        new_gene_ftr = copy.deepcopy(ftrs[i])
                        new_gene_ftr["type"] = "gene"
                        ftrs[i]["Parent"] = new_gene_ftr["ID"]
                        new_ftrs.append(new_gene_ftr)

                    if ("CDS" in ftrs[i]["type"]):
                        new_rna_ftr = copy.deepcopy(ftrs[i])
                        new_rna_ftr["type"] = "mRNA"
                        new_ftrs.append(new_rna_ftr)
                        ftrs[i]["Parent"] = new_rna_ftr["ID"]

                new_ftrs.append(ftrs[i])
            feature_list[contig] = new_ftrs
        return feature_list

    @staticmethod
    def _update_phytozome_features(feature_list):

        #General rule is to use the "Name" field where possible
        #And update parent attribute correspondingly
        for contig in feature_list:
            feature_position_dict = {}
            for i in range(len(feature_list[contig])):

                #Maintain old_id for reference
                #Sometimes ID isn't available, so use PACid
                old_id = None
                for key in ("id", "pacid"):
                    if (key in feature_list[contig][i]['attributes']):
                        old_id = feature_list[contig][i]['attributes'][key][0]
                        break
                if (old_id is None):
                    #This should be an error
                    #log("Cannot find unique ID, PACid, or pacid in GFF "
                    #    "attributes: " + feature_list[contig][i][contig])
                    continue

                #Retain old_id
                feature_position_dict[old_id] = i

                # Clip off the increment on CDS IDs so fragments of the same
                # CDS share the same ID
                if "CDS" in feature_list[contig][i]["ID"]:
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        "ID"].rsplit('.', 1)[0]

                #In Phytozome, gene and mRNA have "Name" field, CDS do not
                if ("name" in feature_list[contig][i]['attributes']):
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        'attributes']['name'][0]

                if ("Parent" in feature_list[contig][i]):
                    #Update Parent to match new ID of parent ftr
                    feature_list[contig][i]["Parent"] = feature_list[contig][
                        feature_position_dict[feature_list[contig][i]
                                              ["Parent"]]]["ID"]

        return feature_list

    def _update_identifiers(self, feature_list):

        #General rules:
        #1) Genes keep identifier
        #2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA"
        #3) CDS always uses RNA identifier with ".CDS" appended

        mRNA_parent_dict = dict()

        for contig in feature_list:
            for ftr in feature_list[contig]:
                if ftr["type"] in self.skip_types:
                    continue
                if ("Parent" in ftr):
                    #Retain old_id of parents
                    old_id = ftr["ID"]

                    if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]):
                        ftr["ID"] = ftr["Parent"] + "." + ftr["type"]

                    #link old to new ids for mRNA to use with CDS
                    if ("RNA" in ftr["type"]):
                        mRNA_parent_dict[old_id] = ftr["ID"]

        return feature_list
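    # Illustrative (hypothetical IDs): a gene "GeneA" keeps its ID; an mRNA whose
    # ID equals its Parent "GeneA" becomes "GeneA.mRNA"; a CDS with Parent
    # "GeneA.1" becomes "GeneA.1.CDS".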

    def _check_location_order(self, locations):
        """If order looks good return None.  
           If out of order return warning
           If on multiple strands return warning"""
        strand = None
        last_start = 0
        for location in locations:
            if strand is None:
                strand = location[2]
            elif strand != location[2]:
                return warnings["both_strand_coordinates"]
        if strand == "-":
            locations = reversed(locations)
        for location in locations:
            if last_start > location[1]:
                return warnings["out_of_order"]
            else:
                last_start = location[1]
        return None

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event back."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(
                "{} is not a supported ontology".format(ontology_type))

        if "event_index" not in self.ont_mappings[ontology_type]:
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })

        return self.ont_mappings[ontology_type]['event_index']

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = collections.defaultdict(dict)
        db_xrefs = []
        # these keys are formatted strangely and require special parsing
        for key in ("go_process", "go_function", "go_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        # CATH terms are not distinct from EC numbers, so they must be found by key
        for term in feature.get('cath_funfam', []) + feature.get('cath', []):
            for ref in term.split(','):
                ontology['CATH'][ref] = [self._create_ontology_event("CATH")]
                self.ontologies_present['CATH'][ref] = self.ont_mappings[
                    'CATH'].get(ref, '')

        search_keys = [
            'ontology_term', 'db_xref', 'dbxref', 'product_source', 'tigrfam',
            'pfam', 'cog', 'go', 'po', 'ko'
        ]
        ont_terms = []
        # flatten out into list of values
        for key in search_keys:
            if key in feature:
                ont_terms += [x for y in feature[key] for x in y.split(',')]

        for ref in ont_terms:
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))
        return dict(ontology), db_xrefs
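
    # Illustrative sketch (identifiers are hypothetical): for a feature whose attributes
    # include db_xref values like "GO:0005737" and "InterPro:IPR000001", this method
    # returns roughly
    #   ({'GO': {'GO:0005737': [<GO event index>]}}, [('InterPro', 'IPR000001')])
    # recognised ontology prefixes go into the ontology mapping, while anything else is
    # kept as a (database, id) tuple in db_xrefs.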

    def _transform_feature(self, contig, in_feature):
        """Converts a feature from the gff ftr format into the appropriate
        format for a genome object """
        def _aliases(feat):
            keys = ('locus_tag', 'old_locus_tag', 'protein_id',
                    'transcript_id', 'gene', 'ec_number', 'gene_synonym')
            alias_list = []
            for key in keys:
                if key in feat['attributes']:
                    alias_list.extend([(key, val)
                                       for val in feat['attributes'][key]])
            return alias_list

        if in_feature['start'] < 1 or in_feature['end'] > len(contig):
            self.warn("Feature with invalid location for specified "
                      "contig: " + str(in_feature))
            if self.strict:
                raise ValueError(
                    "Features must be completely contained within the Contig in the "
                    "Fasta file. Feature: " + str(in_feature))
            return

        feat_seq = contig.seq[in_feature['start'] -
                              1:in_feature['end']].upper()
        if in_feature['strand'] in {'-', '-1'}:
            feat_seq = feat_seq.reverse_complement()

        # if the feature ID is duplicated (CDS or transpliced gene) we only
        # need to update the location and dna_sequence
        if in_feature.get('ID') in self.feature_dict:
            existing = self.feature_dict[in_feature['ID']]
            existing['location'].append(self._location(in_feature))
            existing['dna_sequence'] = existing.get('dna_sequence',
                                                    '') + str(feat_seq)
            existing['dna_sequence_length'] = len(existing['dna_sequence'])
            return

        # The following is common to all the feature types
        out_feat = {
            "id": in_feature.get('ID'),
            "type": in_feature['type'],
            "location": [self._location(in_feature)],
            "dna_sequence": str(feat_seq),
            "dna_sequence_length": len(feat_seq),
            "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
        }

        # add optional fields
        if 'note' in in_feature['attributes']:
            out_feat['note'] = in_feature['attributes']["note"][0]
        ont, db_xrefs = self._get_ontology_db_xrefs(in_feature['attributes'])
        if ont:
            out_feat['ontology_terms'] = ont
        aliases = _aliases(in_feature)
        if aliases:
            out_feat['aliases'] = aliases
        if db_xrefs:
            out_feat['db_xrefs'] = db_xrefs
        if 'product' in in_feature['attributes']:
            out_feat['functions'] = in_feature['attributes']["product"]
        if 'product_name' in in_feature['attributes']:
            if "functions" in out_feat:
                out_feat['functions'].extend(
                    in_feature['attributes']["product_name"])
            else:
                out_feat['functions'] = in_feature['attributes'][
                    "product_name"]
        if 'function' in in_feature['attributes']:
            out_feat['functional_descriptions'] = in_feature['attributes'][
                "function"]
        if 'inference' in in_feature['attributes']:
            GenomeUtils.parse_inferences(in_feature['attributes']['inference'])
        if 'trans-splicing' in in_feature['attributes'].get('exception', []):
            out_feat['flags'] = out_feat.get('flags', []) + ['trans_splicing']
        if 'pseudo' in in_feature['attributes'].get('exception', []):
            out_feat['flags'] = out_feat.get('flags', []) + ['pseudo']
        if 'ribosomal-slippage' in in_feature['attributes'].get(
                'exception', []):
            out_feat['flags'] = out_feat.get('flags',
                                             []) + ['ribosomal_slippage']
        parent_id = in_feature.get('Parent', '')
        if parent_id and parent_id not in self.feature_dict:
            raise ValueError(
                "Parent ID: {} was not found in feature ID list.".format(
                    parent_id))

        # if the feature is an exon or UTR, it is only used to update the
        # location and sequence of its parent, so we add the info to the parent
        # feature but not to the feature dict
        if in_feature['type'] in self.skip_types:
            if parent_id and in_feature['type'] in {
                    'exon', 'five_prime_UTR', 'three_prime_UTR'
            }:
                parent = self.feature_dict[parent_id]
                if in_feature['type'] not in parent:
                    parent[in_feature['type']] = []
                parent[in_feature['type']].append(out_feat)
            return

        # add type specific features
        elif 'gene' in in_feature['type']:
            out_feat['protein_translation_length'] = 0
            out_feat['cdss'] = []

        elif in_feature['type'] == 'CDS':
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'cdss' in parent:  # parent must be a gene
                    if not is_parent(parent, out_feat):
                        parent["warnings"] = parent.get('warnings', []) + [
                            warnings[
                                "genes_CDS_child_fails_location_validation"].
                            format(out_feat["id"])
                        ]
                        out_feat["warnings"] = out_feat.get('warnings', []) + [
                            warnings[
                                "CDS_fail_child_of_gene_coordinate_validation"]
                            .format(parent_id)
                        ]
                    parent['cdss'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent_id
                else:  # parent must be mRNA
                    if not is_parent(parent, out_feat):
                        parent["warnings"] = parent.get('warnings', []) + [
                            warnings["mRNA_fail_parent_coordinate_validation"].
                            format(out_feat["id"])
                        ]
                        out_feat["warnings"] = out_feat.get('warnings', []) + [
                            warnings[
                                "CDS_fail_child_of_mRNA_coordinate_validation"]
                            .format(parent_id)
                        ]
                    parent['cds'] = in_feature['ID']
                    out_feat['parent_mrna'] = parent_id
                    parent_gene = self.feature_dict[parent['parent_gene']]
                    parent_gene['cdss'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent['parent_gene']
            # keep track of CDSs for post processing
            self.cdss.add(out_feat['id'])

        elif in_feature['type'] == 'mRNA':
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'mrnas' not in parent:
                    parent['mrnas'] = []
                if 'cdss' in parent:  # parent must be a gene
                    parent['mrnas'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent_id
                if not is_parent(parent, out_feat):
                    parent["warnings"] = parent.get('warnings', []) + [
                        warnings["genes_mRNA_child_fails_location_validation"].
                        format(out_feat["id"])
                    ]
                    out_feat["warnings"] = out_feat.get('warnings', []) + [
                        warnings["mRNAs_parent_gene_fails_location_validation"]
                        .format(parent_id)
                    ]

        else:
            out_feat["type"] = in_feature['type']
            # this prevents big misc_features from blowing up the genome size
            if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
                del out_feat['dna_sequence']
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'children' not in parent:
                    parent['children'] = []
                parent['children'].append(out_feat['id'])
                out_feat['parent_gene'] = parent_id
                if not is_parent(parent, out_feat):
                    parent["warnings"] = parent.get('warnings', []) + [
                        warnings[
                            "generic_parents_child_fails_location_validation"].
                        format(out_feat["id"])
                    ]
                    out_feat["warnings"] = out_feat.get('warnings', []) + [
                        warnings[
                            "generic_childs_parent_fails_location_validation"].
                        format(parent_id)
                    ]

        self.feature_dict[out_feat['id']] = out_feat

    def _process_cdss(self):
        """Because CDSs can have multiple fragments, it's necessary to go
        back over them to calculate a final protein sequence"""
        for cds_id in self.cdss:
            cds = self.feature_dict[cds_id]
            try:
                prot_seq = str(
                    Seq(cds['dna_sequence']).translate(self.code_table,
                                                       cds=True).strip("*"))
            except TranslationError as e:
                cds['warnings'] = cds.get('warnings', []) + [str(e)]
                prot_seq = ""

            cds.update({
                "protein_translation":
                prot_seq,
                "protein_md5":
                hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
                "protein_translation_length":
                len(prot_seq),
            })
            if 'parent_gene' in cds:
                parent_gene = self.feature_dict[cds['parent_gene']]
                # propagate selected CDS properties up to the parent gene
                propagate_cds_props_to_gene(cds, parent_gene)
            elif self.generate_genes:
                spoof = copy.copy(cds)
                spoof['type'] = 'gene'
                spoof['id'] = cds['id'] + "_gene"
                spoof['cdss'] = [cds['id']]
                spoof['warnings'] = [
                    warnings['spoofed_gene'].format(cds['id'])
                ]
                self.feature_dict[spoof['id']] = spoof
                cds['parent_gene'] = spoof['id']
                self.spoof_gene_count += 1
            else:
                raise ValueError(warnings['no_spoof'])

            self.feature_dict[cds['id']] = cds

    def _update_from_exons(self, feature):
        """This function updates the sequence and location of a feature based
            on it's UTRs, CDSs and exon information"""

        # note that start and end here are in direction of translation
        def start(loc):
            return loc[0][1]

        def end(loc):
            if loc[-1][2] == "+":
                return loc[-1][1] + loc[-1][3] + 1
            else:
                return loc[-1][1] - loc[-1][3] - 1

        if 'exon' in feature:
            # update the feature with the exon locations and sequences
            feature['location'] = [x['location'][0] for x in feature['exon']]
            feature['dna_sequence'] = "".join(x['dna_sequence']
                                              for x in feature['exon'])
            feature['dna_sequence_length'] = len(feature['dna_sequence'])

        # construct feature location from utrs and cdss if present
        elif 'cds' in feature:
            cds = [copy.deepcopy(self.feature_dict[feature['cds']])]
            locs = []
            seq = ""
            for frag in feature.get('five_prime_UTR', []) + cds + \
                    feature.get('three_prime_UTR', []):

                # merge into last location if adjacent
                if locs and abs(end(locs) - start(frag['location'])) == 1:
                    # extend the location length by the length of the first
                    # location in the fragment
                    first = frag['location'].pop(0)
                    locs[-1][3] += first[3]

                locs.extend(frag['location'])
                seq += frag['dna_sequence']

            feature['location'] = locs
            feature['dna_sequence'] = seq
            feature['dna_sequence_length'] = len(seq)

        else:
            raise ValueError(
                'Feature {} must contain either exon or cds data to '
                'construct an accurate location and sequence'.format(
                    feature['id']))

        # remove these properties as they are no longer needed
        for x in ['five_prime_UTR', 'three_prime_UTR', 'exon']:
            feature.pop(x, None)
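
    # Illustrative sketch (coordinates are hypothetical): with locations of the form
    # [contig_id, start, strand, length], a 5' UTR fragment ['contig_1', 100, '+', 50]
    # covers bases 100-149, so end() returns 151 and a CDS fragment starting at 150
    # satisfies abs(end(locs) - start(frag['location'])) == 1; the merge then adds the
    # CDS fragment's length onto locs[-1][3] instead of appending a second, directly
    # adjacent location entry.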

    def _gen_genome_info(self, core_genome_name, scientific_name, assembly_ref,
                         source, source_id, assembly, input_gff_file,
                         molecule_type):
        """
        _gen_genome_info: generate genome info

        """
        genome = dict()
        genome["id"] = core_genome_name
        genome["scientific_name"] = scientific_name
        genome["assembly_ref"] = assembly_ref
        genome['molecule_type'] = molecule_type
        genome["features"] = []
        genome["cdss"] = []
        genome["mrnas"] = []
        genome['non_coding_features'] = []
        genome["gc_content"] = assembly["gc_content"]
        genome["dna_size"] = assembly["dna_size"]
        genome['md5'] = assembly['md5']
        genome['contig_ids'], genome['contig_lengths'] = zip(
            *[(k, v['length']) for k, v in assembly['contigs'].items()])
        genome['num_contigs'] = len(assembly['contigs'])
        genome['ontologies_present'] = dict(self.ontologies_present)
        genome['ontology_events'] = self.ontology_events
        genome['taxonomy'], genome['taxon_ref'], genome['domain'], \
            genome["genetic_code"] = self.gi.retrieve_taxon(self.taxon_wsname,
                                                            genome['scientific_name'])
        genome['source'], genome['genome_tiers'] = self.gi.determine_tier(
            source)
        genome['source_id'] = source_id

        # Phytozome gff files are not compatible with the RNASeq Pipeline
        # so it's better to build from the object than cache the file
        if self.is_phytozome:
            gff_file_to_shock = self.dfu.file_to_shock({
                'file_path': input_gff_file,
                'make_handle': 1,
                'pack': "gzip"
            })
            genome['gff_handle_ref'] = gff_file_to_shock['handle']['hid']

        for feature in self.feature_dict.values():
            self.feature_counts[feature['type']] += 1
            if 'exon' in feature or feature['type'] == 'mRNA':
                self._update_from_exons(feature)

            # Test if location order is in order.
            is_transpliced = "flags" in feature and "trans_splicing" in feature[
                "flags"]
            if not is_transpliced and len(feature["location"]) > 1:
                # Check the order only if not trans_spliced and has more than 1 location.
                location_warning = self._check_location_order(
                    feature["location"])
                if location_warning is not None:
                    feature["warnings"] = feature.get('warnings',
                                                      []) + [location_warning]

            contig_len = genome["contig_lengths"][genome["contig_ids"].index(
                feature["location"][0][0])]
            feature = check_full_contig_length_or_multi_strand_feature(
                feature, is_transpliced, contig_len, self.skip_types)

            # sort features into their respective arrays
            if feature['type'] == 'CDS':
                del feature['type']
                genome['cdss'].append(feature)
            elif feature['type'] == 'mRNA':
                del feature['type']
                genome['mrnas'].append(feature)
            elif feature['type'] == 'gene':
                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in feature:
                        feature[key] = list(set(feature[key]))
                if feature['cdss']:
                    del feature['type']
                    self.feature_counts["protein_encoding_gene"] += 1
                    genome['features'].append(feature)
                else:
                    feature.pop('mrnas', None)
                    feature.pop('cdss', None)
                    feature.pop('protein_translation_length', None)
                    self.feature_counts["non_coding_features"] += 1
                    genome['non_coding_features'].append(feature)
            else:
                genome['non_coding_features'].append(feature)

        if self.warnings:
            genome['warnings'] = self.warnings
        genome['feature_counts'] = dict(self.feature_counts)
        return genome
Example #8
0
class CocacolaUtil:
    CONCOCT_BASE_PATH = '/kb/deployment/bin/CONCOCT'
    COCACOLA_BASE_PATH = '/kb/module/lib/kb_cocacola/bin/COCACOLA-python'
    BINNER_RESULT_DIRECTORY = 'cocacola_output_dir'
    BINNER_BIN_RESULT_DIR = 'final_bins'
    MAPPING_THREADS = 16
    BBMAP_MEM = '30g'

    def __init__(self, config):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shock_url = config['shock-url']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.mgu = MetagenomeUtils(self.callback_url)

    def _validate_run_cocacola_params(self, task_params):
        """
        _validate_run_cocacola_params:
                validates params passed to run_cocacola method
        """
        log('Start validating run_cocacola params')

        # check for required parameters
        for p in ['assembly_ref', 'binned_contig_name', 'workspace_name', 'reads_list', 'read_mapping_tool']:
            if p not in task_params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        os.chdir(self.scratch)
        log('Start executing command:\n{}'.format(command))
        log('Command is running from:\n{}'.format(self.scratch))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, stderr = pipe.communicate()
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\n'.format(exitCode))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nStderr:\n{}'.format(exitCode, output, stderr)
            raise ValueError(error_msg)
        return (output, stderr)

    # this function has been customized to return read_type variable (interleaved vs single-end library)
    def stage_reads_list_file(self, reads_list):
        """
        stage_reads_list_file: download the fastq files associated with the reads objects
                          to the scratch area and return result_file_path
        """

        log('Processing reads object list: {}'.format(reads_list))

        result_file_path = []
        read_type = []

        # getting from workspace and writing to scratch. The 'reads' dictionary now has file paths to scratch.
        reads = self.ru.download_reads({'read_libraries': reads_list, 'interleaved': None})['files']

        # reads_list is the list of workspace object references (e.g. 12804/1/1).
        # "reads" maps each reference to a dict whose 'files' entry holds the staged
        # scratch file paths ('fwd', optionally 'rev') and the read library 'type'.
        for read_obj in reads_list:
            files = reads[read_obj]['files']    # 'files' is dictionary where 'fwd' is key of file path on scratch.
            result_file_path.append(files['fwd'])
            read_type.append(files['type'])
            if 'rev' in files and files['rev'] is not None:
                result_file_path.append(files['rev'])

        return result_file_path, read_type
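
    # Illustrative sketch (reference and paths are hypothetical): for a reads_list of
    # ['12804/1/1'], ReadsUtils.download_reads returns roughly
    #   {'files': {'12804/1/1': {'files': {'fwd': '<scratch>/reads.fastq',
    #                                      'rev': None,
    #                                      'type': 'interleaved'}}}}
    # 'reads' above holds the inner mapping, so reads[read_obj]['files'] yields the
    # staged 'fwd' path, the library 'type', and a 'rev' path only when one exists.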

    def _get_contig_file(self, assembly_ref):
        """
        _get_contig_file: get contig file from GenomeAssembly object
        """
        contig_file = self.au.get_assembly_as_fasta({'ref': assembly_ref}).get('path')

        sys.stdout.flush()
        contig_file = self.dfu.unpack_file({'file_path': contig_file})['file_path']

        return contig_file

    def retrieve_and_clean_assembly(self, task_params):
        if os.path.exists(task_params['contig_file_path']):
            assembly = task_params['contig_file_path']
            print("FOUND ASSEMBLY ON LOCAL SCRATCH")
        else:
            # we are on njsw so lets copy it over to scratch
            assembly = self._get_contig_file(task_params['assembly_ref'])

        # remove spaces from fasta headers because that breaks bedtools
        assembly_clean = os.path.abspath(assembly).split('.fa')[0] + "_clean.fa"

        command = '/bin/bash reformat.sh in={} out={} addunderscore overwrite=true'.format(assembly, assembly_clean)

        log('running reformat command: {}'.format(command))
        out, err = self._run_command(command)

        return assembly_clean

    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        """ generates SeqRecords iterator for writing from a legacy contigset object """
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        """ removes all contigs less than the min_contig_length provided """
        filtered_fasta_file_path = os.path.abspath(fasta_file_path).split('.fa')[0] + "_filtered.fa"

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path

    def generate_stats_for_genome_bins(self, task_params, genome_bin_fna_file, bbstats_output_file):
        """
        generate_command: bbtools stats.sh command
        """
        log("running generate_stats_for_genome_bins on {}".format(genome_bin_fna_file))
        genome_bin_fna_file = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY, genome_bin_fna_file)
        command = '/bin/bash stats.sh in={} format=3 > {}'.format(genome_bin_fna_file, bbstats_output_file)
        self._run_command(command)
        with open(bbstats_output_file, 'r') as bbstats_handle:
            bbstats_fields = bbstats_handle.readlines()[1].rstrip('\n').split('\t')
        (n_scaffolds, n_contigs, scaf_bp, contig_bp, gap_pct, scaf_N50, scaf_L50,
         ctg_N50, ctg_L50, scaf_N90, scaf_L90, ctg_N90, ctg_L90, scaf_max, ctg_max,
         scaf_n_gt50K, scaf_pct_gt50K) = bbstats_fields[:17]
        gc_avg = float(bbstats_fields[17]) * 100  # scaled to percent; original note: need to confirm this is correct
        gc_std = float(bbstats_fields[18]) * 100  # scaled to percent; original note: need to confirm this is correct

        log('Generated generate_stats_for_genome_bins command: {}'.format(command))

        return {'n_scaffolds': n_scaffolds,
                'n_contigs': n_contigs,
                'scaf_bp': scaf_bp,
                'contig_bp': contig_bp,
                'gap_pct': gap_pct,
                'scaf_N50': scaf_N50,
                'scaf_L50': scaf_L50,
                'ctg_N50': ctg_N50,
                'ctg_L50': ctg_L50,
                'scaf_N90': scaf_N90,
                'scaf_L90': scaf_L90,
                'ctg_N90': ctg_N90,
                'ctg_L90': ctg_L90,
                'scaf_max': scaf_max,
                'ctg_max': ctg_max,
                'scaf_n_gt50K': scaf_n_gt50K,
                'scaf_pct_gt50K': scaf_pct_gt50K,
                'gc_avg': gc_avg,
                'gc_std': gc_std
                }

    def deinterlace_raw_reads(self, fastq):
        fastq_forward = fastq.split('.fastq')[0] + "_forward.fastq"
        fastq_reverse = fastq.split('.fastq')[0] + "_reverse.fastq"
        command = 'reformat.sh in={} out1={} out2={} overwrite=true'.format(fastq, fastq_forward, fastq_reverse)
        self._run_command(command)
        return (fastq_forward, fastq_reverse)

    def run_read_mapping_interleaved_pairs_mode(self, task_params, assembly_clean, fastq, sam):
        read_mapping_tool = task_params['read_mapping_tool']
        log("running {} mapping in interleaved mode.".format(read_mapping_tool))
        if task_params['read_mapping_tool'] == 'bbmap':
            command = 'bbmap.sh -Xmx{} '.format(self.BBMAP_MEM)
            command += 'threads={} '.format(self.MAPPING_THREADS)
            command += 'ref={} '.format(assembly_clean)
            command += 'in={} '.format(fastq)
            command += 'out={} '.format(sam)
            command += 'fast interleaved=true mappedonly nodisk overwrite'
        elif task_params['read_mapping_tool'] == 'bwa':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            command = 'bwa index {} && '.format(assembly_clean)
            command += 'bwa mem -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} '.format(fastq_forward)
            command += '{} > '.format(fastq_reverse)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_default':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 -x {} '.format(bt2index)
            command += '-1 {} '.format(fastq_forward)
            command += '-2 {} '.format(fastq_reverse)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_very_sensitive':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 --very-sensitive -x {} '.format(bt2index)
            command += '-1 {} '.format(fastq_forward)
            command += '-2 {} '.format(fastq_reverse)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'minimap2':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            command = 'minimap2 -ax sr -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} '.format(fastq_forward)
            command += '{} > '.format(fastq_reverse)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'hisat2':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            ht2index = os.path.basename(assembly_clean) + '.ht2'
            command = 'hisat2-build {} '.format(assembly_clean)
            command += '{} && '.format(ht2index)
            command += 'hisat2 -x {} '.format(ht2index)
            command += '-1 {} '.format(fastq_forward)
            command += '-2 {} '.format(fastq_reverse)
            command += '-S {} '.format(sam)
            command += '--threads {}'.format(self.MAPPING_THREADS)
        log('running alignment command: {}'.format(command))
        out, err = self._run_command(command)

    def run_read_mapping_unpaired_mode(self, task_params, assembly_clean, fastq, sam):
        read_mapping_tool = task_params['read_mapping_tool']
        log("running {} mapping in single-end (unpaired) mode.".format(read_mapping_tool))
        if task_params['read_mapping_tool'] == 'bbmap':
            command = 'bbmap.sh -Xmx{} '.format(self.BBMAP_MEM)
            command += 'threads={} '.format(self.MAPPING_THREADS)
            command += 'ref={} '.format(assembly_clean)
            command += 'in={} '.format(fastq)
            command += 'out={} '.format(sam)
            command += 'fast interleaved=false mappedonly nodisk overwrite'
            # BBMap is deterministic without the deterministic flag if using single-ended reads
        elif task_params['read_mapping_tool'] == 'bwa':
            command = 'bwa index {} && '.format(assembly_clean)
            command += 'bwa mem -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} > '.format(fastq)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_default':
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 -x {} '.format(bt2index)
            command += '-U {} '.format(fastq)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_very_sensitive':
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 --very-sensitive -x {} '.format(bt2index)
            command += '-U {} '.format(fastq)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'minimap2':
            command = 'minimap2 -ax sr -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} > '.format(fastq)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'hisat2':
            ht2index = os.path.basename(assembly_clean) + '.ht2'
            command = 'hisat2-build {} '.format(assembly_clean)
            command += '{} && '.format(ht2index)
            command += 'hisat2 -x {} '.format(ht2index)
            command += '-U {} '.format(fastq)
            command += '-S {} '.format(sam)
            command += '--threads {}'.format(self.MAPPING_THREADS)
        log('running alignment command: {}'.format(command))
        out, err = self._run_command(command)

    def convert_sam_to_sorted_and_indexed_bam(self, sam):
        # create bam files from sam files
        sorted_bam = os.path.abspath(sam).split('.sam')[0] + "_sorted.bam"

        # -F 0x04 drops unmapped reads; -u emits uncompressed BAM for the piped sort
        command = 'samtools view -F 0x04 -uS {} | '.format(sam)
        command += 'samtools sort - -o {}'.format(sorted_bam)

        log('running samtools command to generate sorted bam: {}'.format(command))
        self._run_command(command)

        # verify we got bams
        if not os.path.exists(sorted_bam):
            log('Failed to find bam file\n{}'.format(sorted_bam))
            sys.exit(1)
        elif os.stat(sorted_bam).st_size == 0:
            log('Bam file is empty\n{}'.format(sorted_bam))
            sys.exit(1)

        # index the bam file
        command = 'samtools index {}'.format(sorted_bam)

        log('running samtools command to index sorted bam: {}'.format(command))
        self._run_command(command)

        return sorted_bam

    def generate_alignment_bams(self, task_params, assembly_clean):
        """
            This function runs the selected read mapper and creates the
            sorted and indexed bam files from sam files using samtools.
        """

        reads_list = task_params['reads_list']

        (read_scratch_path, read_type) = self.stage_reads_list_file(reads_list)

        sorted_bam_file_list = []

        # list of reads files, can be 1 or more; assumes reads are either unpaired or interleaved
        # and will not handle unpaired forward and reverse reads input as separate (non-interleaved) files

        for i in range(len(read_scratch_path)):
            fastq = read_scratch_path[i]
            fastq_type = read_type[i]

            sam = os.path.basename(fastq).split('.fastq')[0] + ".sam"
            sam = os.path.join(self.BINNER_RESULT_DIRECTORY, sam)

            if fastq_type == 'interleaved':  # make sure working - needs tests
                log("Running interleaved read mapping mode")
                self.run_read_mapping_interleaved_pairs_mode(task_params, assembly_clean, fastq, sam)
            else:  # running read mapping in single-end mode
                log("Running unpaired read mapping mode")
                self.run_read_mapping_unpaired_mode(task_params, assembly_clean, fastq, sam)

            sorted_bam = self.convert_sam_to_sorted_and_indexed_bam(sam)

            sorted_bam_file_list.append(sorted_bam)

        return sorted_bam_file_list

    def generate_make_coverage_table_command(self, task_params, sorted_bam_file_list):
        # create the depth file for this bam
        #
        min_contig_length = task_params['min_contig_length']
        sorted_bam = task_params['sorted_bam']

        depth_file_path = os.path.join(self.scratch, 'cocacola_depth.txt')
        command = '/kb/module/lib/kb_cocacola/bin/jgi_summarize_bam_contig_depths '
        command += '--outputDepth {} '.format(depth_file_path)
        command += '--minContigLength {} '.format(min_contig_length)
        command += '--minContigDepth 1 {}'.format(sorted_bam)

        log('running summarize_bam_contig_depths command: {}'.format(command))
        self._run_command(command)

        return depth_file_path

    def generate_cocacola_cut_up_fasta_command(self, task_params):
        """
        generate_command: cocacola cut_up_fasta
        """
        contig_file_path = task_params['contig_file_path']
        contig_split_size = task_params['contig_split_size']
        contig_split_overlap = task_params['contig_split_overlap']

        log("\n\nRunning generate_cocacola_cut_up_fasta_command")

        command = 'python {}/scripts/cut_up_fasta.py '.format(self.CONCOCT_BASE_PATH)
        command += '{} '.format(contig_file_path)
        command += '-c {} '.format(contig_split_size)
        command += '-o {} '.format(contig_split_overlap)
        command += '--merge_last -b temp.bed > {}/split_contigs.fa'.format(self.BINNER_RESULT_DIRECTORY)
        log('Generated cocacola_cut_up_fasta command: {}'.format(command))

        self._run_command(command)

    def generate_cocacola_input_table_from_bam(self, task_params):
        """
        generate_command: cocacola generate input table
        """
        log("\n\nRunning generate_cocacola_input_table_from_bam")
        command = 'python {}/scripts/gen_input_table.py '.format(self.CONCOCT_BASE_PATH)

        command += '{}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY)
        command += '{}/*_sorted.bam > '.format(self.BINNER_RESULT_DIRECTORY)
        command += '{}/coverage_table.tsv'.format(self.BINNER_RESULT_DIRECTORY)
        log('Generated cocacola generate input table from bam command: {}'.format(command))
        calc_contigs = 0
        for line in open('{}/split_contigs.fa'.format(self.BINNER_RESULT_DIRECTORY)):
            if line.startswith(">"):
                calc_contigs += 1
        task_params['calc_contigs'] = calc_contigs
        self._run_command(command)

    def generate_cocacola_kmer_composition_table(self, task_params):
        """
        generate_command: cocacola generate kmer composition table
        """
        log("\n\nRunning generate_cocacola_kmer_composition_table")
        calc_contigs = task_params['calc_contigs']
        kmer_size = task_params['kmer_size']
        command = 'python {}/scripts/fasta_to_features.py '.format(self.CONCOCT_BASE_PATH)
        command += '{}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY)
        command += '{} '.format(calc_contigs)
        command += '{} '.format(kmer_size)
        command += '{}/split_contigs_kmer_{}.csv'.format(self.BINNER_RESULT_DIRECTORY, kmer_size)
        log('Generated cocacola kmer composition table command: {}'.format(command))

        self._run_command(command)

    def generate_cocacola_command(self, task_params):
        """
        generate_command: cocacola
        """

        min_contig_length = task_params['min_contig_length']
        kmer_size = task_params['kmer_size']

        log("\n\nRunning generate_cocacola_command")
        command = 'python {}/cocacola.py '.format(self.COCACOLA_BASE_PATH)
        command += '--contig_file {}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY)
        command += '--abundance_profiles {}/coverage_table.tsv '.format(self.BINNER_RESULT_DIRECTORY)
        command += '--composition_profiles {}/split_contigs_kmer_{}.csv '.format(self.BINNER_RESULT_DIRECTORY,
                                                                                 kmer_size)
        command += '--output {}/cocacola_output_clusters_min{}.csv'.format(self.BINNER_RESULT_DIRECTORY,
                                                                           min_contig_length)

        log('Generated cocacola command: {}'.format(command))

        self._run_command(command)

    def add_header_to_post_clustering_file(self, task_params):
        min_contig_length = task_params['min_contig_length']
        header = "contig_id,cluster_id"
        with open('{}/cocacola_output_clusters_min{}_headers.csv'.format(self.BINNER_RESULT_DIRECTORY,
                                                                         min_contig_length), 'w') as outfile:
            outfile.write(header)
            with open('{}/cocacola_output_clusters_min{}.csv'.format(self.BINNER_RESULT_DIRECTORY,
                                                                     min_contig_length), 'r') as datafile:
                for line in datafile:
                    outfile.write(line)

    def generate_cocacola_post_clustering_merging_command(self, task_params):
        """
        generate_command: cocacola post cluster merging
        """
        min_contig_length = task_params['min_contig_length']
        log("\n\nRunning generate_cocacola_post_clustering_merging_command")

        command = 'python {}/scripts/merge_cutup_clustering.py '.format(self.CONCOCT_BASE_PATH)
        command += '{}/cocacola_output_clusters_min{}_headers.csv > '.format(self.BINNER_RESULT_DIRECTORY,
                                                                             min_contig_length)
        command += '{}/clustering_merged_min{}.csv'.format(self.BINNER_RESULT_DIRECTORY, min_contig_length)
        log('Generated generate_cocacola_post_clustering_merging command: {}'.format(command))

        self._run_command(command)

    def generate_cocacola_extract_fasta_bins_command(self, task_params):
        """
        generate_command: cocacola extract_fasta_bins
        """
        log("\n\nRunning generate_cocacola_extract_fasta_bins_command")

        contig_file_path = task_params['contig_file_path']
        min_contig_length = task_params['min_contig_length']

        bin_result_directory = self.BINNER_RESULT_DIRECTORY + '/' + self.BINNER_BIN_RESULT_DIR
        self._mkdir_p(bin_result_directory)
        command = 'python {}/scripts/extract_fasta_bins.py '.format(self.CONCOCT_BASE_PATH)
        command += '{} '.format(contig_file_path)
        command += '{}/clustering_merged_min{}.csv '.format(self.BINNER_RESULT_DIRECTORY, min_contig_length)
        command += '--output_path {}/{}'.format(self.BINNER_RESULT_DIRECTORY, self.BINNER_BIN_RESULT_DIR)
        log('Generated generate_cocacola_extract_fasta_bins_command command: {}'.format(command))

        self._run_command(command)

    def rename_and_standardize_bin_names(self, task_params):
        """
        generate_command: generate renamed bins
        """
        log("\n\nRunning rename_and_standardize_bin_names")
        path_to_cocacola_result_bins = os.path.abspath(self.BINNER_RESULT_DIRECTORY) + \
            '/' + self.BINNER_BIN_RESULT_DIR + '/'
        for dirname, subdirs, files in os.walk(path_to_cocacola_result_bins):
            for file in files:
                if file.endswith('.fa'):
                    os.rename(os.path.abspath(path_to_cocacola_result_bins) + '/' +
                              file, os.path.abspath(path_to_cocacola_result_bins) + '/bin.' +
                              file.split('.fa')[0].zfill(3) + '.fasta')  # need to change to 4 digits

    def make_binned_contig_summary_file_for_binning_apps(self, task_params):
        """
        generate_command: generate binned contig summary command
        """
        log("\n\nRunning make_binned_contig_summary_file_for_binning_apps")
        path_to_cocacola_result = os.path.abspath(self.BINNER_RESULT_DIRECTORY)
        path_to_cocacola_result_bins = '{}/{}/'.format(path_to_cocacola_result, self.BINNER_BIN_RESULT_DIR)
        path_to_summary_file = path_to_cocacola_result_bins + 'binned_contig.summary'
        with open(path_to_summary_file, 'w+') as f:
            f.write("Bin name\tCompleteness\tGenome size\tGC content\n")
            for dirname, subdirs, files in os.walk(path_to_cocacola_result_bins):
                for file in files:
                    if file.endswith('.fasta'):
                        genome_bin_fna_file = os.path.join(self.BINNER_BIN_RESULT_DIR, file)
                        bbstats_output_file = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY,
                                                           genome_bin_fna_file).split('.fasta')[0] + ".bbstatsout"
                        bbstats_output = self.generate_stats_for_genome_bins(task_params,
                                                                             genome_bin_fna_file,
                                                                             bbstats_output_file)
                        f.write('{}\t0\t{}\t{}\n'.format(genome_bin_fna_file.split("/")[-1],
                                                         bbstats_output['contig_bp'],
                                                         bbstats_output['gc_avg']))
        log('Finished make_binned_contig_summary_file_for_binning_apps function')
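
    # Illustrative sketch (numbers are hypothetical): the binned_contig.summary written
    # above is a simple tab-separated table, one row per renamed bin, e.g.
    #   Bin name        Completeness    Genome size     GC content
    #   bin.001.fasta   0               2350123         45.2
    # Completeness is hard-coded to 0 here, while Genome size and GC content come from
    # the bbtools stats output ('contig_bp' and 'gc_avg').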

    def generate_output_file_list(self, result_directory):
        """
        generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cocacola_result.zip')

        with zipfile.ZipFile(result_file, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:

            for dirname, subdirs, files in os.walk(result_directory):
                for file in files:
                    # skip alignment intermediates and the per-bin summary file
                    if file.endswith(('.sam', '.bam', '.bai', '.summary')):
                        continue
                    if dirname.endswith(self.BINNER_BIN_RESULT_DIR):
                        continue
                    zip_file.write(os.path.join(dirname, file), file)
                if dirname.endswith(self.BINNER_BIN_RESULT_DIR):
                    # keep the final bins under their own directory inside the zip
                    baseDir = os.path.basename(dirname)
                    for file in files:
                        full = os.path.join(dirname, file)
                        zip_file.write(full, os.path.join(baseDir, file))

        output_files.append({'path': result_file,
                             'name': os.path.basename(result_file),
                             'label': os.path.basename(result_file),
                             'description': 'Files generated by COCACOLA App'})

        return output_files

    def generate_html_report(self, result_directory, assembly_ref, binned_contig_obj_ref):
        """
        generate_html_report: generate html summary report
        """

        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        # get summary data from existing assembly object and bins_objects
        Summary_Table_Content = ''
        Overview_Content = ''
        (binned_contig_count, input_contig_count, total_bins_count) = \
            self.generate_overview_info(assembly_ref, binned_contig_obj_ref, result_directory)

        Overview_Content += '<p>Binned contigs: {}</p>'.format(binned_contig_count)
        Overview_Content += '<p>Input contigs: {}</p>'.format(input_contig_count)
        Overview_Content += '<p>Number of bins: {}</p>'.format(total_bins_count)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          Overview_Content)
                report_template = report_template.replace('Summary_Table_Content',
                                                          Summary_Table_Content)
                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for kb_cocacola App'})
        return html_report

    def generate_overview_info(self, assembly_ref, binned_contig_obj_ref, result_directory):
        """
        _generate_overview_info: generate overview information from assembly and binnedcontig
        """

        # get assembly and binned_contig objects that already have some data populated in them
        assembly = self.dfu.get_objects({'object_refs': [assembly_ref]})['data'][0]
        binned_contig = self.dfu.get_objects({'object_refs': [binned_contig_obj_ref]})['data'][0]

        input_contig_count = assembly.get('data').get('num_contigs')
        binned_contig_count = 0
        total_bins_count = 0
        total_bins = binned_contig.get('data').get('bins')
        total_bins_count = len(total_bins)
        for bin in total_bins:
            binned_contig_count += len(bin.get('contigs'))

        return (binned_contig_count, input_contig_count, total_bins_count)

    def generate_report(self, binned_contig_obj_ref, task_params):
        """
        generate_report: generate summary report
        """
        log('Generating report')

        result_directory = os.path.join(self.scratch, "cocacola_output_dir")

        task_params['result_directory'] = result_directory

        output_files = self.generate_output_file_list(task_params['result_directory'])

        output_html_files = self.generate_html_report(task_params['result_directory'],
                                                      task_params['assembly_ref'],
                                                      binned_contig_obj_ref)

        report_params = {
            'message': '',
            'workspace_name': task_params['workspace_name'],
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 266,
            'report_object_name': 'kb_cocacola_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def create_dict_from_depth_file(self, depth_file_path):
        # keep contig order (required by metabat2)
        depth_file_dict = {}
        with open(depth_file_path, 'r') as f:
            header = f.readline().rstrip().split("\t")
            # print('HEADER1 {}'.format(header))
            # map(str.strip, header)
            for line in f:
                # deal with cases where the fastq name has spaces. Assume the first
                # non-whitespace word is unique and use it as the ID.
                # line = line.rstrip()
                vals = line.rstrip().split("\t")
                if ' ' in vals[0]:
                    ID = vals[0].split()[0]
                else:
                    ID = vals[0]
                depth_file_dict[ID] = vals[1:]
            depth_file_dict['header'] = header
        return depth_file_dict
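
    # Illustrative sketch (values are hypothetical): the depth file produced by
    # jgi_summarize_bam_contig_depths is tab-separated with a header row, e.g.
    #   contigName   contigLen   totalAvgDepth   reads_sorted.bam   reads_sorted.bam-var
    #   contig_1     45210       12.3            12.3               4.1
    # so create_dict_from_depth_file maps 'contig_1' -> ['45210', '12.3', '12.3', '4.1']
    # and keeps the raw header list under the 'header' key.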

    def run_cocacola(self, task_params):
        """
        run_cocacola: cocacola app

        required params:
            assembly_ref: Metagenome assembly object reference
            binned_contig_name: BinnedContig object name and output file header
            workspace_name: the name of the workspace it gets saved to.
            reads_list: list of reads objects (PairedEndLibrary/SingleEndLibrary)
            upon which COCACOLA will be run

        optional params:
            min_contig_length: minimum contig length; default 1000

            ref: https://github.com/BinPro/CONCOCT/blob/develop/README.md
        """
        log('--->\nrunning CocacolaUtil.run_cocacola\n' +
            'task_params:\n{}'.format(json.dumps(task_params, indent=1)))

        self._validate_run_cocacola_params(task_params)

        # get assembly
        contig_file = self._get_contig_file(task_params['assembly_ref'])
        task_params['contig_file_path'] = contig_file

        # clean the assembly file so that there are no spaces in the fasta headers
        assembly_clean = self.retrieve_and_clean_assembly(task_params)

        assembly_clean_temp = self.filter_contigs_by_length(assembly_clean, task_params['min_contig_length'])

        task_params['contig_file_path'] = assembly_clean_temp
        assembly_clean = assembly_clean_temp  # need to clean this up, ugly redundant variable usage

        # get reads
        (reads_list_file, read_type) = self.stage_reads_list_file(task_params['reads_list'])
        task_params['read_type'] = read_type
        task_params['reads_list_file'] = reads_list_file

        # prep result directory
        result_directory = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY)
        self._mkdir_p(result_directory)

        cwd = os.getcwd()
        log('changing working dir to {}'.format(result_directory))
        os.chdir(result_directory)

        # run alignments, and update input contigs to use the clean file
        # this function has an internal loop to generate a sorted bam file for each input read file
        self.generate_alignment_bams(task_params, assembly_clean)

        # not used right now
        # depth_file_path = self.generate_make_coverage_table_command(task_params, sorted_bam_file_list)
        # depth_dict = self.create_dict_from_depth_file(depth_file_path)

        # run cocacola prep, cut up fasta input
        self.generate_cocacola_cut_up_fasta_command(task_params)

        # run cocacola prep, generate coverage tables from bam
        self.generate_cocacola_input_table_from_bam(task_params)

        # run cocacola prep, generate kmer table
        self.generate_cocacola_kmer_composition_table(task_params)

        # run cocacola prep and cocacola
        self.generate_cocacola_command(task_params)

        # run command to add header to output file
        self.add_header_to_post_clustering_file(task_params)

        # run cocacola post cluster merging command
        self.generate_cocacola_post_clustering_merging_command(task_params)

        # run extract bins command
        self.generate_cocacola_extract_fasta_bins_command(task_params)

        # run fasta renaming
        self.rename_and_standardize_bin_names(task_params)

        # make binned contig summary file
        self.make_binned_contig_summary_file_for_binning_apps(task_params)

        # file handling and management
        os.chdir(cwd)
        log('changing working dir to {}'.format(cwd))

        log('Saved result files to: {}'.format(result_directory))
        log('Generated files:\n{}'.format('\n'.join(os.listdir(result_directory))))

        # make new BinnedContig object and upload to KBase
        generate_binned_contig_param = {
            'file_directory': os.path.join(result_directory, self.BINNER_BIN_RESULT_DIR),
            'assembly_ref': task_params['assembly_ref'],
            'binned_contig_name': task_params['binned_contig_name'],
            'workspace_name': task_params['workspace_name']
        }

        binned_contig_obj_ref = \
            self.mgu.file_to_binned_contigs(generate_binned_contig_param).get('binned_contig_obj_ref')

        # generate report
        reportVal = self.generate_report(binned_contig_obj_ref, task_params)
        returnVal = {
            'result_directory': result_directory,
            'binned_contig_obj_ref': binned_contig_obj_ref
        }
        returnVal.update(reportVal)

        return returnVal
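
    # Illustrative usage sketch (object references and values are hypothetical):
    # run_cocacola expects a task_params dict with at least the required keys checked
    # by _validate_run_cocacola_params, plus the tuning values used by the helpers:
    #   cocacola = CocacolaUtil(config)
    #   result = cocacola.run_cocacola({
    #       'assembly_ref': '12345/6/1',
    #       'binned_contig_name': 'my_binned_contigs',
    #       'workspace_name': 'my_workspace',
    #       'reads_list': ['12345/7/1'],
    #       'read_mapping_tool': 'bbmap',
    #       'min_contig_length': 1000,
    #       'contig_split_size': 10000,
    #       'contig_split_overlap': 0,
    #       'kmer_size': 4,
    #   })
    # result carries 'result_directory', 'binned_contig_obj_ref', and the report
    # name/ref added by generate_report.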
Example #9
0
class DASToolUtil:
    DASTOOL_THREADS = 2
    BINNER_RESULT_DIRECTORY = 'das_tool_output_dir'
    BINNER_BIN_RESULT_DIR = 'das_tool_output_dir_DASTool_bins'

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shock_url = config['shock-url']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.mgu = MetagenomeUtils(self.callback_url)

    def validate_run_das_tool_params(self, params):
        """
        validate_run_das_tool_params:
                validates params passed to run_das_tool method

        """
        log('Start validating run_kb_das_tool params')

        # check for required parameters
        for p in [
                'assembly_ref', 'input_binned_contig_names',
                'output_binned_contig_name', 'workspace_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def mkdir_p(self, path):
        """
        mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
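        # Note: on Python 3.2+, os.makedirs(path, exist_ok=True) provides the
        # same EEXIST-tolerant behaviour in a single call.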

    def run_command(self, command):
        """
        run_command: run command and print result
        """
        #os.chdir(self.scratch)
        log('Start executing command:\n{}'.format(command))
        log('Command is running from:\n{}'.format(self.scratch))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, stderr = pipe.communicate()
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\n'.format(exitCode))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nStderr:\n{}'.format(
                exitCode, output, stderr)
            raise ValueError(error_msg)
        return (output, stderr)

    def get_contig_file(self, assembly_ref):
        """
        get_contig_file: get the contig FASTA file from an Assembly object
        """

        contig_file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref
        }).get('path')

        sys.stdout.flush()
        contig_file = self.dfu.unpack_file({'file_path':
                                            contig_file})['file_path']

        return contig_file

    def retrieve_and_clean_assembly(self, task_params):
        if os.path.exists(task_params['contig_file_path']):
            assembly = task_params['contig_file_path']
            print("FOUND ASSEMBLY ON LOCAL SCRATCH")
        else:
            # we are on njsw, so let's copy it over to scratch
            assembly = self.get_contig_file(task_params['assembly_ref'])

        # remove spaces from fasta headers because that breaks bedtools
        assembly_clean = os.path.abspath(assembly).split(
            '.fa')[0] + "_clean.fa"

        command = '/bin/bash reformat.sh in={} out={} addunderscore'.format(
            assembly, assembly_clean)

        log('running reformat command: {}'.format(command))
        out, err = self.run_command(command)

        return assembly_clean

    def generate_output_file_list(self, result_directory):
        """
        generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self.mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'das_tool_result.zip')
        report_file = None

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:

            # grab all files we want to zip
            for dirname, subdirs, files in os.walk(result_directory):
                for file in files:
                    if (file.endswith('.sam') or file.endswith('.bam')
                            or file.endswith('.bai')
                            or file.endswith('.summary')):
                        continue
                    if (dirname.endswith(self.BINNER_BIN_RESULT_DIR)):
                        continue
                    zip_file.write(os.path.join(dirname, file), file)
                if (dirname.endswith(self.BINNER_BIN_RESULT_DIR)):
                    baseDir = os.path.basename(dirname)
                    for file in files:
                        full = os.path.join(dirname, file)
                        zip_file.write(full, os.path.join(baseDir, file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'Files generated by kb_das_tool App'
        })

        return output_files

    def generate_html_report(self, result_directory, assembly_ref,
                             binned_contig_obj_ref):
        """
        generate_html_report: generate html summary report
        """

        log('Start generating html report')

        output_directory = os.path.join(self.scratch,
                                        'html_dir_' + str(uuid.uuid4()))
        self.mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        # get summary data from existing assembly object and bins_objects
        Summary_Table_Content = ''
        Overview_Content = ''
        (binned_contig_count, input_contig_count,
         total_bins_count) = self.generate_overview_info(
             assembly_ref, binned_contig_obj_ref, result_directory)

        # get pdfs
        pdf_filename_l = [
            f for f in os.listdir(self.BINNER_RESULT_DIRECTORY)
            if f.endswith('.pdf')
        ]
        assert len(pdf_filename_l) == 2

        Overview_Content += '<p>Binned contigs: {}</p>'.format(
            binned_contig_count)
        Overview_Content += '<p>Input contigs: {}</p>'.format(
            input_contig_count)
        Overview_Content += '<p>Number of bins: {}</p>'.format(
            total_bins_count)
        for pdf_filename in pdf_filename_l:
            Overview_Content += '\n<embed src="{}" width="1000px" height="700px">'.format(
                pdf_filename)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                report_template = report_template.replace(
                    'Summary_Table_Content', Summary_Table_Content)
                result_file.write(report_template)

        # copy pdfs into html dir
        for pdf_filename in pdf_filename_l:
            shutil.copyfile(
                os.path.join(self.BINNER_RESULT_DIRECTORY, pdf_filename),
                os.path.join(output_directory, pdf_filename))

        # save html dir to shock
        def dir_to_shock(dir_path, name, description):
            '''
            For regular directories or html directories

            name - for regular directories: the name of the flat (zip) file returned to ui
                   for html directories: the name of the html file
            '''
            dfu_fileToShock_ret = self.dfu.file_to_shock({
                'file_path': dir_path,
                'make_handle': 0,
                'pack': 'zip',
            })

            dir_shockInfo = {
                'shock_id': dfu_fileToShock_ret['shock_id'],
                'name': name,
                'description': description
            }

            return dir_shockInfo

        html_shockInfo = dir_to_shock(output_directory, 'report.html',
                                      'Report html for DAS tool')
        """
        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for kb_concoct App'})

        return html_report
        """

        return [html_shockInfo]

    def generate_overview_info(self, assembly_ref, binned_contig_obj_ref,
                               result_directory):
        """
        generate_overview_info: generate overview information from the assembly and BinnedContig objects
        """

        # get assembly and binned_contig objects that already have some data populated in them
        assembly = self.dfu.get_objects({'object_refs':
                                         [assembly_ref]})['data'][0]
        binned_contig = self.dfu.get_objects(
            {'object_refs': [binned_contig_obj_ref]})['data'][0]

        input_contig_count = assembly.get('data').get('num_contigs')
        bins_directory = os.path.join(self.scratch, result_directory,
                                      self.BINNER_BIN_RESULT_DIR)
        binned_contig_count = 0
        total_bins_count = 0
        total_bins = binned_contig.get('data').get('bins')
        total_bins_count = len(total_bins)
        for bin in total_bins:
            binned_contig_count += len(bin.get('contigs'))

        return (binned_contig_count, input_contig_count, total_bins_count)

    def generate_report(self, binned_contig_obj_ref, params):
        """
        generate_report: generate summary report

        """
        log('Generating report')
        params['result_directory'] = self.BINNER_RESULT_DIRECTORY

        output_files = self.generate_output_file_list(
            params['result_directory'])

        output_html_files = self.generate_html_report(
            params['result_directory'], params['assembly_ref'],
            binned_contig_obj_ref)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 500,
            'report_object_name': 'kb_das_tool_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def rename_and_standardize_bin_names(self):
        """
        rename_and_standardize_bin_names: rename output bins to a standard bin.NNN.fasta scheme
        """
        log("\n\nRunning rename_and_standardize_bin_names")
        i = 0
        path_to_result_bins = os.path.join(self.scratch,
                                           self.BINNER_RESULT_DIRECTORY,
                                           "das_tool_output_dir_DASTool_bins")
        path_to_das_tool_key = os.path.abspath(
            path_to_result_bins) + '/das_tool_name_key.tsv'
        with open(path_to_das_tool_key, 'w+') as f:
            f.write("Original.Bin.Name\tRenamed.Bin.Name\n")
            for dirname, subdirs, files in os.walk(path_to_result_bins):
                for file in files:
                    if file.endswith('.fa'):
                        i += 1
                        os.rename(
                            os.path.abspath(path_to_result_bins) + '/' + file,
                            os.path.abspath(path_to_result_bins) + '/bin.' +
                            str(i).zfill(3) +
                            '.fasta')  # need to change to 4 digits
                        f.write(file + '\tbin.' + str(i).zfill(3) + '.fasta\n')

    def make_binned_contig_summary_file_for_binning_apps(self, task_params):
        """
        make_binned_contig_summary_file_for_binning_apps: write a binned_contig.summary file describing the generated bins
        """
        log("\n\nRunning make_binned_contig_summary_file_for_binning_apps")
        result_directory = task_params['result_directory']
        path_to_result_bins = '{}/{}/'.format(
            result_directory, task_params['bin_result_directory'])
        path_to_summary_file = path_to_result_bins + 'binned_contig.summary'
        with open(path_to_summary_file, 'w+') as f:
            f.write("Bin name\tCompleteness\tGenome size\tGC content\n")
            for dirname, subdirs, files in os.walk(path_to_result_bins):
                for file in files:
                    if file.endswith('.fasta'):
                        genome_bin_fna_file = os.path.join(
                            path_to_result_bins, file)
                        bbstats_output_file = os.path.join(
                            self.scratch, self.BINNER_RESULT_DIRECTORY,
                            genome_bin_fna_file).split(
                                '.fasta')[0] + ".bbstatsout"
                        bbstats_output = self.generate_stats_for_genome_bins(
                            task_params, genome_bin_fna_file,
                            bbstats_output_file)
                        f.write('{}\t0\t{}\t{}\n'.format(
                            genome_bin_fna_file.split("/")[-1],
                            bbstats_output['contig_bp'],
                            bbstats_output['gc_avg']))
        log('Finished make_binned_contig_summary_file_for_binning_apps function'
            )


    def generate_stats_for_genome_bins(self, task_params, genome_bin_fna_file,
                                       bbstats_output_file):
        """
        generate_command: bbtools stats.sh command
        """
        log("running generate_stats_for_genome_bins on {}".format(
            genome_bin_fna_file))
        genome_bin_fna_file = os.path.join(self.scratch,
                                           self.BINNER_RESULT_DIRECTORY,
                                           genome_bin_fna_file)
        command = '/bin/bash stats.sh in={} format=3 > {}'.format(
            genome_bin_fna_file, bbstats_output_file)
        self.run_command(command)
        bbstats_output = open(bbstats_output_file, 'r').readlines()[1]
        n_scaffolds = bbstats_output.split('\t')[0]
        n_contigs = bbstats_output.split('\t')[1]
        scaf_bp = bbstats_output.split('\t')[2]
        contig_bp = bbstats_output.split('\t')[3]
        gap_pct = bbstats_output.split('\t')[4]
        scaf_N50 = bbstats_output.split('\t')[5]
        scaf_L50 = bbstats_output.split('\t')[6]
        ctg_N50 = bbstats_output.split('\t')[7]
        ctg_L50 = bbstats_output.split('\t')[8]
        scaf_N90 = bbstats_output.split('\t')[9]
        scaf_L90 = bbstats_output.split('\t')[10]
        ctg_N90 = bbstats_output.split('\t')[11]
        ctg_L90 = bbstats_output.split('\t')[12]
        scaf_max = bbstats_output.split('\t')[13]
        ctg_max = bbstats_output.split('\t')[14]
        scaf_n_gt50K = bbstats_output.split('\t')[15]
        scaf_pct_gt50K = bbstats_output.split('\t')[16]
        gc_avg = float(bbstats_output.split('\t')
                       [17]) * 100  # need to figure out if correct
        gc_std = float(bbstats_output.split('\t')
                       [18]) * 100  # need to figure out if correct

        log('Generated generate_stats_for_genome_bins command: {}'.format(
            command))

        return {
            'n_scaffolds': n_scaffolds,
            'n_contigs': n_contigs,
            'scaf_bp': scaf_bp,
            'contig_bp': contig_bp,
            'gap_pct': gap_pct,
            'scaf_N50': scaf_N50,
            'scaf_L50': scaf_L50,
            'ctg_N50': ctg_N50,
            'ctg_L50': ctg_L50,
            'scaf_N90': scaf_N90,
            'scaf_L90': scaf_L90,
            'ctg_N90': ctg_N90,
            'ctg_L90': ctg_L90,
            'scaf_max': scaf_max,
            'ctg_max': ctg_max,
            'scaf_n_gt50K': scaf_n_gt50K,
            'scaf_pct_gt50K': scaf_pct_gt50K,
            'gc_avg': gc_avg,
            'gc_std': gc_std
        }

    def generate_das_tool_input_files_and_commands_from_binned_contigs(
            self, params):
        #params['binned_contig_list_file'] = binned_contig_list_file
        binned_contig_names = params['input_binned_contig_names']
        trimmed_binned_contig_name_list = []
        contig_to_bin_file_name_list = []
        for input_ref in binned_contig_names:
            # next line needed for testing
            # binned_contig = self.dfu.get_objects({'object_refs': [input_ref['binned_contig_obj_ref']]})['data'][0]

            # next line needed in production only
            binned_contig = self.dfu.get_objects({'object_refs':
                                                  [input_ref]})['data'][0]
            binned_contig_name = binned_contig.get('info')[1]
            binned_contig_data = binned_contig.get('data')
            bins = binned_contig_data.get('bins')
            trimmed_binned_contig_name = binned_contig_name.split(
                ".BinnedContig")[0]
            trimmed_binned_contig_name_list.append(trimmed_binned_contig_name)
            contig_to_bin_file_name = "{}_contigs_to_bins.tsv".format(
                trimmed_binned_contig_name)
            contig_to_bin_file_name_list.append(contig_to_bin_file_name)

            f = open(contig_to_bin_file_name, "w+")
            for bin in bins:
                bin_id = bin.get('bid')
                trimmed_bin_id = bin_id.split(".fasta")[0]
                contigs = bin.get('contigs')
                for contig_id, contig_value in contigs.items():
                    f.write("{}\t{}.{}\n".format(contig_id,
                                                 trimmed_binned_contig_name,
                                                 trimmed_bin_id))
            f.close()
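            # each *_contigs_to_bins.tsv line looks like (hypothetical ids):
            #   k141_1001<TAB>mybins.bin.001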
        #contig_to_bin_file_name_list = self.BINNER_RESULT_DIRECTORY + contig_to_bin_file_name
        # temp = str(self.BINNER_RESULT_DIRECTORY) + '/'
        # contig_to_bin_file_name_list = [temp + s for s in contig_to_bin_file_name_list]

        return (trimmed_binned_contig_name_list, contig_to_bin_file_name_list)

    def generate_das_tool_command(self, params,
                                  trimmed_binned_contig_name_list,
                                  contig_to_bin_file_name_list):
        """
        generate_das_tool_command: generate the DAS_Tool shell command
        """

        print("\n\nRunning generate_das_tool_command")

        command = 'DAS_Tool '

        command += '-i {} '.format(contig_to_bin_file_name_list)
        command += '-l {} '.format(trimmed_binned_contig_name_list)
        command += '-c {} '.format(params.get('contig_file_path'))
        command += '-o {} '.format(self.BINNER_RESULT_DIRECTORY)
        command += '--search_engine {} '.format(params.get('search_engine'))
        command += '--score_threshold {} '.format(
            params.get('score_threshold'))
        command += '--duplicate_penalty {} '.format(
            params.get('duplicate_penalty'))
        command += '--megabin_penalty {} '.format(
            params.get('megabin_penalty'))
        command += '--write_bin_evals {} '.format(
            params.get('write_bin_evals'))
        command += '--create_plots {} '.format(params.get('create_plots'))
        command += '--write_bins 1 '
        command += '--write_unbinned 0 '
        command += '-t {}'.format(self.DASTOOL_THREADS)
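        # Illustrative only (hypothetical file and label names, rest elided):
        # the assembled command resembles
        #   DAS_Tool -i binsA_contigs_to_bins.tsv,binsB_contigs_to_bins.tsv \
        #       -l binsA,binsB -c assembly_clean.fa -o das_tool_output_dir \
        #       --search_engine diamond --score_threshold 0.5 ... -t 2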

        log('Generated das_tool command: {}'.format(command))

        return command

    def run_das_tool(self, params):
        """
        run_das_tool: DAS_Tool app

        required params:
            assembly_ref: Metagenome assembly object reference
            input_binned_contig_names: list of BinnedContig objects
            output_binned_contig_name: output BinnedContig object name
            workspace_name: the name of the workspace it gets saved to.

        optional params:
            search_engine; default diamond
            score_threshold; default 0.5
            duplicate_penalty; default 0.6
            megabin_penalty; default 0.5
            write_bin_evals; default 1
            create_plots; default 1
            write_bins; default 1
            write_unbinned; default 0

        ref: https://github.com/cmks/DAS_Tool
        """
        log('--->\nrunning DASToolUtil.run_das_tool\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_run_das_tool_params(params)

        print("\n\nFinished running validate_run_das_tool_params")
        #
        contig_file = self.get_contig_file(params.get('assembly_ref'))
        params['contig_file_path'] = contig_file

        result_directory = os.path.join(self.scratch,
                                        self.BINNER_RESULT_DIRECTORY)
        params['result_directory'] = result_directory
        self.mkdir_p(result_directory)

        cwd = os.getcwd()
        log('Changing working dir to {}'.format(result_directory))
        os.chdir(result_directory)

        (
            trimmed_binned_contig_name_list, contig_to_bin_file_name_list
        ) = self.generate_das_tool_input_files_and_commands_from_binned_contigs(
            params)
        comma_symbol = ','
        trimmed_binned_contig_name_list = comma_symbol.join(
            trimmed_binned_contig_name_list)
        contig_to_bin_file_name_list = comma_symbol.join(
            contig_to_bin_file_name_list)

        log(os.listdir(result_directory))
        log("trimmed_binned_contig_name_list {}".format(
            trimmed_binned_contig_name_list))
        log("contig_to_bin_file_name_list {}".format(
            contig_to_bin_file_name_list))


        # run DAS_Tool
        command = self.generate_das_tool_command(
            params, trimmed_binned_contig_name_list,
            contig_to_bin_file_name_list)
        log('\nWorking dir is {}'.format(os.getcwd()))
        log('Changing working dir to {}'.format(result_directory))
        os.chdir(result_directory)
        self.run_command(command)

        os.chdir(self.scratch)

        task_params = {}
        task_params['result_directory'] = os.path.join(self.scratch)
        task_params['bin_result_directory'] = os.path.join(
            self.BINNER_RESULT_DIRECTORY, "das_tool_output_dir_DASTool_bins")

        # check to make sure bins were generated, otherwise no need to run the rest
        if not os.path.exists(task_params['bin_result_directory']):
            log('DAS_Tool did not succeed in generating a set of bins using the input bins and parameters - skipping the creation of a new BinnedContig object.'
                )
            log('Note: this result is sometimes expected using the DAS-Tool workflow; it is possible that DAS-Tool cannot optimize the input binned contigs.'
                )
            log('KBase is aware of this error!')
            log('Currently KBase treats this run as an error because an output set of binned contigs is expected.')
            raise ValueError(
                'No bins generated - this is one of the expected results when DAS-Tool cannot optimize the input bins, and not necessarily an error. KBase is aware of the issue where DAS-Tool runs successfully but does not produce any output set of optimized bins.'
            )
        else:
            self.rename_and_standardize_bin_names()
            self.make_binned_contig_summary_file_for_binning_apps(task_params)

            generate_binned_contig_param = {
                'file_directory':
                os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY,
                             "das_tool_output_dir_DASTool_bins"),
                'assembly_ref':
                params.get('assembly_ref'),
                'binned_contig_name':
                params.get('output_binned_contig_name'),
                'workspace_name':
                params.get('workspace_name')
            }

            binned_contig_obj_ref = self.mgu.file_to_binned_contigs(
                generate_binned_contig_param).get('binned_contig_obj_ref')

            reportVal = self.generate_report(binned_contig_obj_ref, params)

            returnVal = {
                'result_directory':
                os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY),
                'binned_contig_obj_ref':
                binned_contig_obj_ref
            }

            returnVal.update(reportVal)

        return returnVal
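
# A minimal usage sketch, not part of the original module: it assumes a KBase
# SDK job environment where the callback URL and scratch directory are
# provided by the job runner; the endpoint URLs and object references below
# are hypothetical placeholders.
if __name__ == '__main__':
    example_config = {
        'SDK_CALLBACK_URL': os.environ.get('SDK_CALLBACK_URL',
                                           'http://localhost:5000'),
        'scratch': '/kb/module/work/tmp',
        'shock-url': 'https://<kbase-env>/services/shock-api',
        'workspace-url': 'https://<kbase-env>/services/ws',
    }
    das_tool_util = DASToolUtil(example_config)
    example_params = {
        'assembly_ref': '12345/6/7',                        # hypothetical UPA
        'input_binned_contig_names': ['12345/8/1', '12345/9/1'],
        'output_binned_contig_name': 'my_dastool_bins',
        'workspace_name': 'my_workspace',
        # optional params shown with the defaults listed in the docstring
        'search_engine': 'diamond',
        'score_threshold': 0.5,
        'duplicate_penalty': 0.6,
        'megabin_penalty': 0.5,
        'write_bin_evals': 1,
        'create_plots': 1,
    }
    result = das_tool_util.run_das_tool(example_params)
    print(result['binned_contig_obj_ref'])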
Example #10
0
class FastaGFFToGenome:
    def __init__(self, config):
        self.cfg = config
        self.au = AssemblyUtil(config.callbackURL)
        self.dfu = DataFileUtil(self.cfg.callbackURL)
        self.gi = GenomeInterface(self.cfg)
        self.taxon_wsname = self.cfg.raw['taxon-workspace-name']
        self.time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        mod_match = re.search(r'module-version:\n\W+(.+)\n', yml_text)
        if mod_match:
            self.version = mod_match.group(1)
        else:
            self.version = None
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        self.code_table = 11
        self.skip_types = ('exon', 'five_prime_UTR', 'three_prime_UTR',
                           'start_codon', 'stop_codon', 'region', 'chromosome',
                           'scaffold')
        self.spoof_gene_count = 0
        self.is_phytozome = False
        self.is_metagenome = False
        self.strict = True
        self.generate_genes = False
        self.warnings = []  # type: list
        self.feature_dict = collections.OrderedDict()  # type: dict
        self.cdss = set()  # type: set
        self.ontologies_present = collections.defaultdict(dict)  # type: dict
        self.ontology_events = list()  # type: list
        self.skiped_features = collections.Counter(
        )  # type: collections.Counter
        self.feature_counts = collections.Counter(
        )  # type: collections.Counter
        self.re_api_url = config.re_api_url

    def warn(self, message):
        self.warnings.append(message)

    def generate_genome_json(self, params):
        # 1) validate parameters
        self._validate_import_file_params(params)
        self.code_table = params.get('genetic_code', 11)
        # 2) construct the input directory staging area
        input_directory = os.path.join(self.cfg.sharedFolder,
                                       'fast_gff_upload_' + str(uuid.uuid4()))
        os.makedirs(input_directory)
        file_paths = self._stage_input(params, input_directory)
        # 3) extract out the parameters
        params = self._set_parsed_params(params)
        if params.get('generate_missing_genes'):
            self.generate_genes = True

        # 4) do the upload
        genome = self._gen_genome_json(params, file_paths["gff_file"],
                                       file_paths["fasta_file"])

        return genome, input_directory

    def import_file(self, params):
        self.is_metagenome = params.get('is_metagenome', False)
        if self.is_metagenome:
            ws_datatype = "KBaseMetagenomes.AnnotatedMetagenomeAssembly"
        else:
            ws_datatype = "KBaseGenomes.Genome"

        genome, input_directory = self.generate_genome_json(params)

        json.dump(genome,
                  open(f"{self.cfg.sharedFolder}/{genome['id']}.json", 'w'),
                  indent=4)
        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params.get('metadata', {}),
            'workspace_datatype': ws_datatype,
        })
        feature_types = "\n".join(
            [f"{k}: {v}" for k, v in genome['feature_counts'].items()])
        report_string = (
            f"A genome with {len(genome['contig_ids'])} contigs and the following feature "
            f"types was imported: \n{feature_types}")
        # XXX report_string is unused except for this log
        logging.info(report_string)

        # 5) clear the temp directory
        shutil.rmtree(input_directory)

        # 6) return the result
        info = result['info']
        prefix = ''
        if self.is_metagenome:
            prefix = 'meta'
        details = {
            prefix + 'genome_ref': f'{info[6]}/{info[0]}/{info[4]}',
            prefix + 'genome_info': info
        }

        return details

    def _gen_genome_json(self, params, input_gff_file, input_fasta_file):
        # reading in GFF file
        features_by_contig = self._retrieve_gff_file(input_gff_file)
        contig_ids = set()

        # parse feature information
        fasta_contigs = Bio.SeqIO.parse(input_fasta_file, "fasta")
        for contig in fasta_contigs:
            molecule_type = str(contig.seq.alphabet).replace(
                'IUPACAmbiguous', '').strip('()')
            contig_ids.add(contig.id)
            for feature in features_by_contig.get(contig.id, []):
                self._transform_feature(contig, feature)

        for cid in set(features_by_contig.keys()) - contig_ids:
            self.warn(
                f"Sequence name {cid} does not match a sequence id in the FASTA file."
                f"{len(features_by_contig[cid])} features will not be imported."
            )
            if self.strict:
                raise ValueError(
                    "Every feature sequence id must match a fasta sequence id")
        prot_fasta_path = f"{self.cfg.sharedFolder}/{params['genome_name']}_protein.fasta"
        # if is a metagenome, the following function writes a protein fasta
        self._process_cdss(prot_fasta_path)

        # save assembly file
        '''
        Metagenome Changes:
            if we want to pass more stuff to AssemblyUtil, do here.
        TODO: add flag to save_assembly_from_fasta
        '''
        if self.is_metagenome:
            genome_type = "metagenome"
        else:
            genome_type = params.get('genome_type', 'isolate')
        if params.get('existing_assembly_ref'):
            assembly_ref = params['existing_assembly_ref']

            ret = self.dfu.get_objects({'object_refs':
                                        [assembly_ref]})['data'][0]

            assembly_obj_type = ret['info'][2].split('-')[0]
            valid_assembly_types = [
                "KBaseGenomeAnnotations.Assembly", "KBaseGenomes.ContigSet"
            ]
            if assembly_obj_type not in valid_assembly_types:
                raise ValueError(
                    f"{assembly_ref} is not a reference to an assembly")

            assembly_data = ret['data']
            # should do more thorough check of sequences.
            if not validate_lists_have_same_elements(
                    assembly_data['contigs'].keys(), contig_ids):
                raise ValueError(
                    f"provided assembly with ref {assembly_ref} does not "
                    "have matching contig ids to provided input fasta.")

            logging.info(f"Using supplied assembly: {assembly_ref}")

        else:
            assembly_ref = self.au.save_assembly_from_fasta({
                'file': {
                    'path': input_fasta_file
                },
                'workspace_name':
                params['workspace_name'],
                'assembly_name':
                params['genome_name'] + ".assembly",
                'type':
                genome_type,
            })
            assembly_data = self.dfu.get_objects({
                'object_refs': [assembly_ref],
                'ignore_errors': 0
            })['data'][0]['data']

        # generate genome info
        genome = self._gen_genome_info(assembly_ref, assembly_data,
                                       input_gff_file, molecule_type,
                                       prot_fasta_path, params)

        if self.spoof_gene_count > 0:
            self.warn(warnings['spoofed_genome'].format(self.spoof_gene_count))
            genome['suspect'] = 1

        if self.warnings:
            genome['warnings'] = self.warnings

        return genome

    @staticmethod
    def _location(in_feature):
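        # Builds a KBase location tuple [contig_id, start, strand, length];
        # for "-" strand features the recorded start is the larger coordinate,
        # e.g. (hypothetical ids) start=100, end=250, strand "-" on "chr1"
        # yields ["chr1", 250, "-", 151].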
        in_feature['strand'] = in_feature['strand'].replace(
            "-1", "-").translate(strand_table)
        if in_feature['strand'] == '+':
            start = in_feature['start']
        elif in_feature['strand'] == '-':
            start = in_feature['end']
        else:
            raise ValueError('Invalid feature strand: {}'.format(
                in_feature['strand']))
        return [
            in_feature['contig'], start, in_feature['strand'],
            in_feature['end'] - in_feature['start'] + 1
        ]

    @staticmethod
    def _validate_import_file_params(params):
        """
        validate_import_file_params:
                    validates params passed to FastaGFFToGenome.import_file method

        """

        # check for required parameters
        for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(f'"{p}" parameter is required, but missing')

        # one and only one of 'path', or 'shock_id' is required
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            if not isinstance(file, dict):
                raise ValueError(f'Required "{key}" field must be a map/dict')
            sources = ('path', 'shock_id')
            n_valid_fields = sum(1 for f in sources if file.get(f))
            print(f"inputs: {n_valid_fields}")
            if n_valid_fields < 1:
                raise ValueError(
                    f'Required "{key}" field must include one source: '
                    f'{", ".join(sources)}')
            if n_valid_fields > 1:
                raise ValueError(
                    f'Required "{key}" field has too many sources specified: '
                    f'{", ".join(file.keys())}')
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(
                    "Invalid genetic code specified: {}".format(params))

    def _set_parsed_params(self, params):
        logging.info('Setting params')

        default_params = {
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'scientific_name': 'unknown_taxon',
            'source': 'User',
            'release': None,
            'metadata': {},
            'source_id': 'unknown',
        }
        default_params.update(params)
        logging.info(json.dumps(default_params, indent=1))
        return default_params

    def _stage_input(self, params, input_directory):
        """
        stage_input: Setup the input_directory by fetching the files and uncompressing if needed

        """

        file_paths = dict()
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            file_path = None
            '''
            below seems like weird if statement
            '''
            if file.get('path') is not None:
                local_file_path = file['path']
                file_path = os.path.join(input_directory,
                                         os.path.basename(local_file_path))
                logging.info(
                    f'Moving file from {local_file_path} to {file_path}')
                # Metagenome Updates
                # not sure if we have to be careful about moving the objects
                # around
                if os.path.isfile(local_file_path):
                    shutil.copy2(local_file_path, file_path)
                else:
                    raise FileNotFoundError(
                        f"Input {key} file {local_file_path} not found")
                err_msg = "Shutil copy unsucessful"

            elif file.get('shock_id') is not None:
                # handle shock file
                logging.info(f'Downloading file from SHOCK node: '
                             f'{self.cfg.sharedFolder}-{file["shock_id"]}')
                sys.stdout.flush()
                file_name = self.dfu.shock_to_file({
                    'file_path': input_directory,
                    'shock_id': file['shock_id']
                })['node_file_name']
                file_path = os.path.join(input_directory, file_name)
                err_msg = "Shock retrieval"
            # extract the file if it is compressed
            '''
            Metagenome Changes:
            may have to make check here to see if the the file is too big for
            working dir.
            '''
            if file_path is not None:
                logging.info("staged input file =" + file_path)
                sys.stdout.flush()
                if not os.path.isfile(file_path):
                    raise FileNotFoundError(f"{file_path} not a file")
                dfUtil_result = self.dfu.unpack_file({'file_path': file_path})
                file_paths[key] = dfUtil_result['file_path']
                err_msg = "DataFielUtil 'unpack_file' function call"
            else:
                raise ValueError(
                    'No valid files could be extracted based on the input')

            if not os.path.isfile(file_path):
                raise ValueError(f"{err_msg} for {key} file to {file_path}")

        return file_paths

    def _retrieve_gff_file(self, input_gff_file):
        """
        _retrieve_gff_file: retrieve info from gff_file

        """
        logging.info("Reading GFF file")

        feature_list = collections.defaultdict(list)  # type: dict
        is_patric = False
        '''
        Metagenome Changes:
            the lines below iterate through the entire gff input file, which
            for a Metagenome may be an issue.

            ! Only a problem if there are space limits on processing in this
              request
        '''
        for current_line in open(input_gff_file):
            if current_line.isspace(
            ) or current_line == "" or current_line.startswith("#"):
                continue

            # Split line
            try:
                (contig_id, source_id, feature_type, start, end, score, strand,
                 phase, attributes) = current_line.split('\t')
            except ValueError:
                raise ValueError(f"unable to parse {current_line}")
            ''' Do Metagenomes need this phytozome/PATRIC stuff??'''
            # Checking to see if Phytozome
            if "phytozome" in source_id.lower():
                self.is_phytozome = True

            # Checking to see if PATRIC
            if "PATRIC" in source_id:
                is_patric = True

            # PATRIC prepends their contig ids with some gibberish
            if is_patric and "|" in contig_id:
                contig_id = contig_id.split("|", 1)[1]

            # Populating basic feature object
            ftr: dict = {
                'contig': contig_id,
                'source': source_id,
                'type': feature_type,
                'start': int(start),
                'end': int(end),
                'score': score,
                'strand': strand,
                'phase': phase,
                'attributes': collections.defaultdict(list)
            }

            # Populating with attribute key-value pair
            # This is where the feature id is from
            for attribute in attributes.split(";"):
                attribute = attribute.strip()

                # Sometimes empty string
                if not attribute:
                    continue

                # Limit the split to 1 because '=' can also appear inside the value
                # Some attributes lack '='; fall back to splitting on a space
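                # e.g. 'ID=gene0001' is stored under key 'id' with value
                # 'gene0001'; 'note "hypothetical protein"' under key 'note'
                # with value 'hypothetical protein' (keys are snake_cased,
                # values URL-unquoted and stripped of surrounding quotes)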
                if "=" in attribute:
                    key, value = attribute.split("=", 1)

                elif " " in attribute:
                    key, value = attribute.split(" ", 1)

                else:
                    logging.debug(f'Unable to parse {attribute}')
                    continue

                ftr['attributes'][make_snake_case(key)].append(
                    parse.unquote(value.strip('"')))

            ftr['attributes']['raw'] = attributes
            if "id" in ftr['attributes']:
                ftr['ID'] = ftr['attributes']['id'][0]
            if "parent" in ftr['attributes']:
                ftr['Parent'] = ftr['attributes']['parent'][0]

            feature_list[contig_id].append(ftr)

        # Some GFF/GTF files don't use "ID" so we go through the possibilities
        feature_list = self._add_missing_identifiers(feature_list)

        # Most bacterial files have only CDSs
        # In order to work with prokaryotic and eukaryotic gene structure synonymously
        # Here we add feature dictionaries representing the parent gene and mRNAs
        # feature_list = self._add_missing_parents(feature_list)

        # Phytozome has the annoying habit of editing their identifiers so we fix them
        if self.is_phytozome:
            self._update_phytozome_features(feature_list)

        # All identifiers need to be checked so that they follow the same general rules
        # Rules are listed within the function itself
        feature_list = self._update_identifiers(feature_list)

        return feature_list

    def _add_missing_identifiers(self, feature_list):
        logging.info("Adding missing identifiers")
        # General rule is to iterate through a range of possibilities if "ID" is missing
        for contig in feature_list:
            for i, feat in enumerate(feature_list[contig]):
                if "ID" not in feature_list[contig][i]:
                    # none of the following are guaranteed to be unique IDs
                    # for key in ("transcriptid", "proteinid", "pacid",
                    #             "parent", "name", 'transcript_id'):
                    for key in ("protein_id", "name", "pacid", "parent"):
                        if key in feature_list[contig][i]['attributes']:
                            feature_list[contig][i]['ID'] = feature_list[
                                contig][i]['attributes'][key][0]
                            break
                    if feat['type'] not in self.skip_types:
                        self.feature_counts[feat['type']] += 1

                    # If the process fails, throw an error
                    if "ID" not in feature_list[contig][i]:
                        feat[
                            'ID'] = f"{feat['type']}_{self.feature_counts[feat['type']]}"
        return feature_list

    def _add_missing_parents(self, feature_list):

        # General rule: if a CDS or RNA is missing a parent, add one
        for contig in feature_list:
            ftrs = feature_list[contig]
            new_ftrs = []
            for i in range(len(ftrs)):
                if ftrs[i]["type"] in self.skip_types:
                    continue
                if "Parent" not in ftrs[i]:
                    # Assuming parent doesn't exist at all, so create de novo instead of trying to find it
                    if "RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]:
                        new_gene_ftr = copy.deepcopy(ftrs[i])
                        new_gene_ftr["type"] = "gene"
                        ftrs[i]["Parent"] = new_gene_ftr["ID"]
                        new_ftrs.append(new_gene_ftr)

                    if "CDS" in ftrs[i]["type"]:
                        new_rna_ftr = copy.deepcopy(ftrs[i])
                        new_rna_ftr["type"] = "mRNA"
                        new_ftrs.append(new_rna_ftr)
                        ftrs[i]["Parent"] = new_rna_ftr["ID"]

                new_ftrs.append(ftrs[i])
            feature_list[contig] = new_ftrs
        return feature_list

    @staticmethod
    def _update_phytozome_features(feature_list):

        # General rule is to use the "Name" field where possible
        # And update parent attribute correspondingly
        for contig in feature_list:
            feature_position_dict = {}
            for i in range(len(feature_list[contig])):

                # Maintain old_id for reference
                # Sometimes ID isn't available, so use PACid
                old_id = None
                for key in ("id", "pacid"):
                    if key in feature_list[contig][i]['attributes']:
                        old_id = feature_list[contig][i]['attributes'][key][0]
                        break
                if old_id is None:
                    continue

                # Retain old_id
                feature_position_dict[old_id] = i

                # Clip off the increment on CDS IDs so fragments of the same
                # CDS share the same ID
                if "CDS" in feature_list[contig][i]["ID"]:
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        "ID"].rsplit('.', 1)[0]

                # In Phytozome, gene and mRNA have "Name" field, CDS do not
                if "name" in feature_list[contig][i]['attributes']:
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        'attributes']['name'][0]

                if "Parent" in feature_list[contig][i]:
                    # Update Parent to match new ID of parent ftr
                    feature_list[contig][i]["Parent"] = feature_list[contig][
                        feature_position_dict[feature_list[contig][i]
                                              ["Parent"]]]["ID"]

        return feature_list

    def _update_identifiers(self, feature_list):

        # General rules:
        # 1) Genes keep identifier
        # 2) RNAs keep identifier only if it's different from the gene's, otherwise append ".mRNA"
        # 3) CDS always uses RNA identifier with ".CDS" appended
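        # Worked example: a gene "g1" keeps "g1"; an mRNA with ID "g1" and
        # Parent "g1" becomes "g1.mRNA"; a CDS with Parent "mRNA1" becomes
        # "mRNA1.CDS"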

        mRNA_parent_dict = dict()

        for contig in feature_list:
            for ftr in feature_list[contig]:
                if ftr["type"] in self.skip_types:
                    continue
                if "Parent" in ftr:
                    # Retain old_id of parents
                    old_id = ftr["ID"]

                    if ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]:
                        ftr["ID"] = ftr["Parent"] + "." + ftr["type"]

                    # link old to new ids for mRNA to use with CDS
                    if "RNA" in ftr["type"]:
                        mRNA_parent_dict[old_id] = ftr["ID"]

        return feature_list

    def _check_location_order(self, locations):
        """If order looks good return None.
           If out of order return warning
           If on multiple strands return warning"""
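        # Example: [["chr1", 10, "+", 50], ["chr1", 100, "+", 30]] is in order
        # and returns None; swapping the two entries returns
        # warnings["out_of_order"]; mixing "+" and "-" strands returns
        # warnings["both_strand_coordinates"]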
        strand = None
        last_start = 0
        for location in locations:
            if strand is None:
                strand = location[2]
            elif strand != location[2]:
                return warnings["both_strand_coordinates"]
        if strand == "-":
            locations = reversed(locations)
        for location in locations:
            if last_start > location[1]:
                return warnings["out_of_order"]
            else:
                last_start = location[1]
        return None

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event back."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(
                "{} is not a supported ontology".format(ontology_type))

        if "event_index" not in self.ont_mappings[ontology_type]:
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })

        return self.ont_mappings[ontology_type]['event_index']

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = collections.defaultdict(dict)  # type: dict
        db_xrefs = []
        # these keys are formatted strangely and require special parsing
        for key in ("go_process", "go_function", "go_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        # CATH terms are not distinct from EC numbers so must be found by key
        for term in feature.get('cath_funfam', []) + feature.get('cath', []):
            for ref in term.split(','):
                ontology['CATH'][ref] = [self._create_ontology_event("CATH")]
                self.ontologies_present['CATH'][ref] = self.ont_mappings[
                    'CATH'].get(ref, '')

        search_keys = [
            'ontology_term', 'db_xref', 'dbxref', 'product_source', 'tigrfam',
            'pfam', 'cog', 'go', 'po', 'ko'
        ]
        ont_terms = []  # type: list
        # flatten out into list of values
        for key in search_keys:
            if key in feature:
                ont_terms += [x for y in feature[key] for x in y.split(',')]

        for ref in ont_terms:
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            elif ":" not in ref:
                db_xrefs.append(tuple(["Unknown_Source", ref]))
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))
        return dict(ontology), db_xrefs

    '''
    Metagenome Changes:
        okay looks like this might be the real meat of it
    '''

    def _transform_feature(self, contig, in_feature):
        """Converts a feature from the gff ftr format into the appropriate
        format for a genome object """
        def _aliases(feat):
            keys = ('locus_tag', 'old_locus_tag', 'protein_id',
                    'transcript_id', 'gene', 'ec_number', 'gene_synonym')
            alias_list = []
            for key in keys:
                if key in feat['attributes']:
                    alias_list.extend([(key, val)
                                       for val in feat['attributes'][key]])
            return alias_list

        if in_feature['start'] < 1 or in_feature['end'] > len(contig):
            self.warn(
                f"Feature with invalid location for specified contig: {in_feature}"
            )
            if self.strict:
                raise ValueError(
                    "Features must be completely contained within the Contig in the "
                    f"Fasta file. Feature: {in_feature}")
            return

        feat_seq = contig.seq[in_feature['start'] -
                              1:in_feature['end']].upper()
        if in_feature['strand'] in {'-', '-1'}:
            feat_seq = feat_seq.reverse_complement()

        # if the feature ID is duplicated (CDS or transpliced gene) we only
        # need to update the location and dna_sequence
        if in_feature.get('ID') in self.feature_dict:
            existing = self.feature_dict[in_feature['ID']]
            existing['location'].append(self._location(in_feature))
            existing['dna_sequence'] = existing.get('dna_sequence',
                                                    '') + str(feat_seq)
            existing['dna_sequence_length'] = len(existing['dna_sequence'])
            return

        # The following is common to all the feature types
        out_feat = {
            "id": in_feature.get('ID'),
            "type": in_feature['type'],
            "location": [self._location(in_feature)],
            "dna_sequence": str(feat_seq),
            "dna_sequence_length": len(feat_seq),
            "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
            "warnings": [],
            "flags": [],
        }

        # add optional fields
        if 'note' in in_feature['attributes']:
            out_feat['note'] = in_feature['attributes']["note"][0]
        ont, db_xrefs = self._get_ontology_db_xrefs(in_feature['attributes'])
        if ont:
            out_feat['ontology_terms'] = ont
        aliases = _aliases(in_feature)
        if aliases:
            out_feat['aliases'] = aliases
        if db_xrefs:
            out_feat['db_xrefs'] = db_xrefs
        if 'product' in in_feature['attributes']:
            out_feat['functions'] = in_feature['attributes']["product"]
        if 'product_name' in in_feature['attributes']:
            if "functions" in out_feat:
                out_feat['functions'].extend(
                    in_feature['attributes']["product_name"])
            else:
                out_feat['functions'] = in_feature['attributes'][
                    "product_name"]
        if 'function' in in_feature['attributes']:
            out_feat['functional_descriptions'] = in_feature['attributes'][
                "function"]
        if 'inference' in in_feature['attributes']:
            GenomeUtils.parse_inferences(in_feature['attributes']['inference'])
        if 'trans-splicing' in in_feature['attributes'].get('exception', []):
            out_feat['flags'].append('trans_splicing')
        if 'pseudo' in in_feature['attributes'].get('exception', []):
            out_feat['flags'].append('pseudo')
        if 'ribosomal-slippage' in in_feature['attributes'].get(
                'exception', []):
            out_feat['flags'].append('ribosomal_slippage')
        parent_id = in_feature.get('Parent', '')
        if parent_id and parent_id not in self.feature_dict:
            raise ValueError(
                f"Parent ID: {parent_id} was not found in feature ID list.")

        # if the feature is an exon or UTR, it is only used to update the
        # location and sequence of its parent; we add the info to the parent
        # feature but not to the feature dict
        if in_feature['type'] in self.skip_types:
            if parent_id and in_feature['type'] in {
                    'exon', 'five_prime_UTR', 'three_prime_UTR'
            }:
                parent = self.feature_dict[parent_id]
                if in_feature['type'] not in parent:
                    parent[in_feature['type']] = []
                parent[in_feature['type']].append(out_feat)
            return

        # add type specific features
        elif 'gene' in in_feature['type']:
            out_feat['protein_translation_length'] = 0
            out_feat['cdss'] = []

        elif in_feature['type'] == 'CDS':
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'cdss' in parent:  # parent must be a gene
                    if not is_parent(parent, out_feat):
                        parent["warnings"] = parent.get('warnings', []) + [
                            warnings[
                                "genes_CDS_child_fails_location_validation"].
                            format(out_feat["id"])
                        ]
                        out_feat["warnings"].append(warnings[
                            "CDS_fail_child_of_gene_coordinate_validation"].
                                                    format(parent_id))
                    parent['cdss'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent_id
                else:  # parent must be mRNA
                    if not is_parent(parent, out_feat):
                        parent["warnings"] = parent.get('warnings', []) + [
                            warnings["mRNA_fail_parent_coordinate_validation"].
                            format(out_feat["id"])
                        ]
                        out_feat["warnings"].append(warnings[
                            "CDS_fail_child_of_mRNA_coordinate_validation"].
                                                    format(parent_id))
                    parent['cds'] = in_feature['ID']
                    out_feat['parent_mrna'] = parent_id
                    parent_gene = self.feature_dict[parent['parent_gene']]
                    parent_gene['cdss'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent['parent_gene']
            # keep track of CDSs for post processing
            self.cdss.add(out_feat['id'])

        elif in_feature['type'] == 'mRNA':
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'mrnas' not in parent:
                    parent['mrnas'] = []
                if 'cdss' in parent:  # parent must be a gene
                    parent['mrnas'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent_id
                if not is_parent(parent, out_feat):
                    parent["warnings"] = parent.get('warnings', []) + [
                        warnings["genes_mRNA_child_fails_location_validation"].
                        format(out_feat["id"])
                    ]
                    out_feat["warnings"].append(
                        warnings["mRNAs_parent_gene_fails_location_validation"]
                        .format(parent_id))

        else:
            out_feat["type"] = in_feature['type']
            # this prevents big misc_features from blowing up the genome size
            if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
                del out_feat['dna_sequence']
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'children' not in parent:
                    parent['children'] = []
                parent['children'].append(out_feat['id'])
                out_feat['parent_gene'] = parent_id
                if not is_parent(parent, out_feat):
                    parent["warnings"] = parent.get('warnings', []) + [
                        warnings[
                            "generic_parents_child_fails_location_validation"].
                        format(out_feat["id"])
                    ]
                    out_feat["warnings"].append(warnings[
                        "generic_childs_parent_fails_location_validation"].
                                                format(parent_id))

        # cleanup empty optional arrays
        for key in ['warnings', 'flags']:
            if not out_feat[key]:
                del out_feat[key]

        self.feature_dict[out_feat['id']] = out_feat

    def _process_cdss(self, prot_fasta_path):
        """Because CDSs can have multiple fragments, it's necessary to go
        back over them to calculate a final protein sequence"""
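        # Two paths, inferred from the branches below: for metagenomes the
        # translations are collected into prot_fasta[protein_id] = [header, seq]
        # and written out as a protein FASTA; for regular genomes the
        # translation, its md5 and its length are stored on each CDS feature.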
        if self.is_metagenome:
            prot_fasta = {}  # type: dict
            untranslatable_prot = set()
        for cds_id in self.cdss:
            cds = self.feature_dict[cds_id]
            try:
                prot_seq = str(
                    Seq(cds['dna_sequence']).translate(self.code_table,
                                                       cds=True).strip("*"))
            except TranslationError as e:
                cds['warnings'] = cds.get('warnings', []) + [str(e)]
                # NOTE: we may need a different way of handling this for metagenomes.
                prot_seq = ""
                if self.is_metagenome:
                    untranslatable_prot.add(cds_id)

            if self.is_metagenome:
                if prot_seq != "":
                    protein_id = ""
                    if cds.get("aliases"):
                        aliases = cds['aliases']
                        for key, val in aliases:
                            if key == "protein_id":
                                protein_id = val
                        if not protein_id:
                            protein_id = cds['id']  # assign to some default
                    else:
                        # log a warning here?
                        pass
                    # TODO: update header to reflect what we actually want people
                    # to see.
                    if protein_id in prot_fasta:
                        prot_fasta[protein_id][0] += "|" + cds['id']
                    else:
                        fasta_seq_data = ">" + protein_id + " cds_ids:" + cds[
                            'id']
                        prot_fasta[protein_id] = [fasta_seq_data, prot_seq]
                else:
                    pass

            else:
                cds.update({
                    "protein_translation":
                    prot_seq,
                    "protein_md5":
                    hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
                    "protein_translation_length":
                    len(prot_seq),
                })

            if 'parent_gene' in cds:
                parent_gene = self.feature_dict[cds['parent_gene']]
                # propagate selected CDS properties up to the parent gene
                propagate_cds_props_to_gene(cds, parent_gene,
                                            self.is_metagenome)
            elif self.generate_genes:
                spoof = copy.copy(cds)
                spoof['type'] = 'gene'
                spoof['id'] = cds['id'] + "_gene"
                spoof['cdss'] = [cds['id']]
                spoof['warnings'] = [
                    warnings['spoofed_gene'].format(cds['id'])
                ]
                self.feature_dict[spoof['id']] = spoof
                cds['parent_gene'] = spoof['id']
                self.spoof_gene_count += 1
            else:
                raise ValueError(warnings['no_spoof'])

            self.feature_dict[cds['id']] = cds

        if self.is_metagenome:
            with open(prot_fasta_path, 'w') as fid:
                for key, line in prot_fasta.items():
                    # each value is [header, sequence]; terminate the record with a newline
                    fid.write('\n'.join(line) + '\n')
            # do something with 'untranslatable_prot'

    def _update_from_exons(self, feature):
        """This function updates the sequence and location of a feature based
            on it's UTRs, CDSs and exon information"""

        # note that start and end here are in direction of translation
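        # The location tuples handled here are assumed to follow the KBase
        # convention [contig_id, start, strand, length], so loc[i][1] is the
        # start coordinate and loc[i][3] is the fragment length.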
        def start(loc):
            return loc[0][1]

        def end(loc):
            if loc[-1][2] == "+":
                return loc[-1][1] + loc[-1][3] + 1
            else:
                return loc[-1][1] - loc[-1][3] - 1

        if 'exon' in feature:
            # update the feature with the exon locations and sequences
            feature['location'] = [x['location'][0] for x in feature['exon']]
            feature['dna_sequence'] = "".join(x['dna_sequence']
                                              for x in feature['exon'])
            feature['dna_sequence_length'] = len(feature['dna_sequence'])

        # construct feature location from utrs and cdss if present
        elif 'cds' in feature:
            cds = [copy.deepcopy(self.feature_dict[feature['cds']])]
            locs = []  # type: list
            seq = ""
            for frag in feature.get('five_prime_UTR', []) + cds + \
                    feature.get('three_prime_UTR', []):

                # merge into last location if adjacent
                if locs and abs(end(locs) - start(frag['location'])) == 1:
                    # extend the location length by the length of the first
                    # location in the fragment
                    first = frag['location'].pop(0)
                    locs[-1][3] += first[3]

                locs.extend(frag['location'])
                seq += frag['dna_sequence']

            feature['location'] = locs
            feature['dna_sequence'] = seq
            feature['dna_sequence_length'] = len(seq)

        else:
            raise ValueError(
                f'Feature {feature["id"]} must contain either exon or cds data to '
                'construct an accurate location and sequence')

        # remove these properties as they are no longer needed
        for x in ['five_prime_UTR', 'three_prime_UTR', 'exon']:
            feature.pop(x, None)

    def _gen_genome_info(self, assembly_ref, assembly, input_gff_file,
                         molecule_type, prot_fasta_path, params):
        """
        _gen_genome_info: generate genome info
        Here is the meat of the saving operation.

        Genome Fields:
            features: protein encoding genes
            cdss:
            mrnas: mrna sequences
            non_coding_features: everything that doesn't fall into 'features',
                'cdss', 'mrnas'
        """
        features = []
        cdss = []
        mrnas = []
        non_coding_features = []
        genome = {
            "id": params.get('genome_name'),
            "scientific_name": params.get('scientific_name', "Unknown"),
            "assembly_ref": assembly_ref,
            'molecule_type': molecule_type,
            "gc_content": assembly["gc_content"],
            "dna_size": assembly["dna_size"],
            'md5': assembly['md5'],
            'num_contigs': len(assembly['contigs']),
            'ontologies_present': dict(self.ontologies_present),
            'ontology_events': self.ontology_events,
        }
        if self.is_metagenome:
            metagenome_fields = [
                ("publications", []),
                ("external_source_origination_date", None),
                ("original_source_file_name", None),
                ("notes", None),
                # NOTE: in the future environment should use an ontology.
                ("environment", None),
            ]  # type: list
            for field, default in metagenome_fields:
                genome[field] = params.get(field, default)

            # save protein fasta to shock
            prot_to_shock = self.dfu.file_to_shock({
                'file_path': prot_fasta_path,
                'make_handle': 1,
                'pack': 'gzip'
            })
            genome['protein_handle_ref'] = prot_to_shock['handle']['hid']

        genome['contig_ids'], genome['contig_lengths'] = zip(
            *[(k, v['length']) for k, v in assembly['contigs'].items()])

        if self.is_metagenome:
            genome['source'], _ = self.gi.determine_tier(params.get('source'))
        else:
            genome['source'], genome['genome_tiers'] = self.gi.determine_tier(
                params.get('source'))

        # Set taxonomy-related fields in the genome data
        if params.get('taxon_id'):
            GenomeUtils.set_taxon_data(int(params['taxon_id']),
                                       self.re_api_url, genome)
        else:
            GenomeUtils.set_default_taxon_data(genome)

        # handle optional fields
        for key in ('release', 'genetic_code', 'genome_type', 'source_id'):
            if params.get(key):
                genome[key] = params[key]

        # Phytozome gff files are not compatible with the RNASeq Pipeline
        # so it's better to build from the object than cache the file
        if self.is_phytozome or self.is_metagenome:
            gff_file_to_shock = self.dfu.file_to_shock({
                'file_path': input_gff_file,
                'make_handle': 1,
                'pack': "gzip"
            })
            genome['gff_handle_ref'] = gff_file_to_shock['handle']['hid']

        for feature in self.feature_dict.values():
            self.feature_counts[feature['type']] += 1
            if 'exon' in feature or feature['type'] == 'mRNA':
                self._update_from_exons(feature)

            # Test if location order is in order.
            is_transpliced = "flags" in feature and "trans_splicing" in feature[
                "flags"]
            if not is_transpliced and len(feature["location"]) > 1:
                # Check the order only if not trans_spliced and has more than 1 location.
                location_warning = self._check_location_order(
                    feature["location"])
                if location_warning is not None:
                    feature["warnings"] = feature.get('warnings',
                                                      []) + [location_warning]

            contig_len = genome["contig_lengths"][genome["contig_ids"].index(
                feature["location"][0][0])]
            feature = check_full_contig_length_or_multi_strand_feature(
                feature, is_transpliced, contig_len, self.skip_types)

            # sort features into their respective arrays
            if feature['type'] == 'CDS':
                if not self.is_metagenome:
                    del feature['type']
                cdss.append(feature)
            elif feature['type'] == 'mRNA':
                if not self.is_metagenome:
                    del feature['type']
                mrnas.append(feature)
            elif feature['type'] == 'gene':
                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in feature:
                        feature[key] = list(set(feature[key]))
                if feature['cdss']:
                    if not self.is_metagenome:
                        del feature['type']
                    self.feature_counts["protein_encoding_gene"] += 1
                    features.append(feature)
                else:
                    feature.pop('mrnas', None)
                    feature.pop('cdss', None)
                    feature.pop('protein_translation_length', None)
                    self.feature_counts["non_coding_gene"] += 1
                    non_coding_features.append(feature)
            else:
                non_coding_features.append(feature)

        # if input is metagenome, save features, cdss, non_coding_features, and
        # mrnas to shock
        if self.is_metagenome:
            # TODO: make this section more efficient by editing the above.
            metagenome_features = features + cdss + mrnas + non_coding_features
            genome['num_features'] = len(metagenome_features)
            genome_name = params['genome_name']
            json_file_path = f'{self.cfg.sharedFolder}/{genome_name}_features.json'
            # save to json files first
            with open(json_file_path, 'w') as fid:
                json.dump(metagenome_features, fid)
            # write json to shock
            json_to_shock = self.dfu.file_to_shock({
                'file_path': json_file_path,
                'make_handle': 1,
                'pack': 'gzip'
            })
            self.feature_counts["non_coding_features"] = len(
                non_coding_features)
            genome['features_handle_ref'] = json_to_shock['handle']['hid']
            # remove json file to avoid disk overload
            os.remove(json_file_path)
            # delete python objects to reduce overhead
            del metagenome_features
            del features, cdss, mrnas, non_coding_features
        else:
            # TODO determine whether we want to deepcopy here instead of reference.
            genome['features'] = features
            genome['cdss'] = cdss
            genome['mrnas'] = mrnas
            genome['non_coding_features'] = non_coding_features
            self.feature_counts["non_coding_features"] = len(
                genome['non_coding_features'])
        if self.warnings:
            genome['warnings'] = self.warnings
        genome['feature_counts'] = dict(self.feature_counts)
        return genome
Example #11
0
class EukrepUtil:
    EUKREP_BASE_PATH = '/Eukrep'
    EUKREP_RESULT_DIRECTORY = 'eukrep_output_dir'
    MAPPING_THREADS = 16
    BBMAP_MEM = '30g'

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shock_url = config['shock-url']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)

    def _validate_run_eukrep_params(self, task_params):
        """
        _validate_run_eukrep_params:
                validates params passed to run_eukrep method
        """
        log('Start validating run_eukrep params')

        # check for required parameters
        for p in ['assembly_ref', 'workspace_name']:
            if p not in task_params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        os.chdir(self.scratch)
        log('Start executing command:\n{}'.format(command))
        log('Command is running from:\n{}'.format(self.scratch))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, shell=True)
        output, stderr = pipe.communicate()
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\n'.format(exitCode))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nStderr:\n{}'.format(
                exitCode, output, stderr)
            raise ValueError(error_msg)
        return (output, stderr)

    def _get_contig_file(self, assembly_ref):
        """
        _get_contig_file: get contig file from GenomeAssembly object
        """
        contig_file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref
        }).get('path')

        sys.stdout.flush()
        contig_file = self.dfu.unpack_file({'file_path':
                                            contig_file})['file_path']

        return contig_file

    def retrieve_assembly(self, task_params):
        if os.path.exists(task_params['contig_file_path']):
            assembly = task_params['contig_file_path']
            print("FOUND ASSEMBLY ON LOCAL SCRATCH")
        else:
            # we are on njsw so lets copy it over to scratch
            assembly = self._get_contig_file(task_params['assembly_ref'])
        return assembly

    def generate_eukrep_command(self, assembly_ref, euk_output_assembly_ref,
                                noneuk_output_assembly_ref):
        """
        generate_command: eukrep
        """

        log("\n\nRunning generate_jorg_command")
        command = 'EukRep '
        command += '-i {} '.format(assembly_ref)
        command += '-o {} '.format(euk_output_assembly_ref)
        command += '--prokarya {} '.format(noneuk_output_assembly_ref)
        log('Generated eukrep command: {}'.format(command))

        return command

    def generate_html_report(self):
        """
        generate_html_report: generate html summary report
        """

        log('Start generating html report')
        #html_report = list()

        output_directory = os.path.join(self.scratch,
                                        'html_dir_' + str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        # get summary data from existing assembly object and bins_objects
        Summary_Table_Content = ''
        Overview_Content = ''

        # generate overview content

        # Example
        # Overview_Content += '<p>Input contigs: {}</p>'.format(input_contig_count)

        Overview_Content += '<p>Iteration Selected: 3</p>'
        Overview_Content += '<p>Number of contigs: 4</p>'
        Overview_Content += '<p>Circularized genome: No </p>'

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                report_template = report_template.replace(
                    'Summary_Table_Content', Summary_Table_Content)
                result_file.write(report_template)

        # save html dir to shock
        def dir_to_shock(dir_path, name, description):
            '''
            For regular directories or html directories

            name - for regular directories: the name of the flat (zip) file returned to ui
                   for html directories: the name of the html file
            '''
            dfu_fileToShock_ret = self.dfu.file_to_shock({
                'file_path': dir_path,
                'make_handle': 0,
                'pack': 'zip',
            })

            dir_shockInfo = {
                'shock_id': dfu_fileToShock_ret['shock_id'],
                'name': name,
                'description': description
            }

            return dir_shockInfo

        html_shockInfo = dir_to_shock(output_directory, 'report.html',
                                      'HTML report for Eukrep')
        """
        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for kb_concoct App'})

        return html_report
        """

        return [html_shockInfo]

    def generate_report(self, params):
        """
        generate_report: generate summary report

        """
        log('Generating report')

        output_html_files = self.generate_html_report()

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 500,
            'report_object_name': 'kb_eukrep_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def run_eukrep(self, task_params):
        """
        run_eukrep: eukrep app

        required params:
            assembly_ref: Metagenome assembly object reference
            binned_contig_name: BinnedContig object name and output file header
            workspace_name: the name of the workspace it gets saved to.
            reads_file: list of reads object (PairedEndLibrary/SingleEndLibrary)
            upon which EUKREP will be run

        optional params:
            TBD

            ref: https://github.com/BinPro/EUKREP/blob/develop/README.md
        """
        log('--->\nrunning EukrepUtil.run_eukrep\n' +
            'task_params:\n{}'.format(json.dumps(task_params, indent=1)))

        self._validate_run_eukrep_params(task_params)

        # get assembly
        contig_file = self._get_contig_file(task_params['assembly_ref'])
        task_params['contig_file_path'] = contig_file

        # resolve the assembly file path (reuses the local scratch copy when present)
        assembly = self.retrieve_assembly(task_params)
        task_params['contig_file_path'] = assembly

        assembly_ref = task_params['contig_file_path']
        euk_output_assembly_ref = assembly_ref.rsplit('.',
                                                      1)[0] + "_eukaryote.fna"
        noneuk_output_assembly_ref = assembly_ref.rsplit(
            '.', 1)[0] + "_noneukaryote.fna"

        command = self.generate_eukrep_command(assembly_ref,
                                               euk_output_assembly_ref,
                                               noneuk_output_assembly_ref)
        self._run_command(command)

        dest = os.path.abspath(self.scratch)

        euk_output_assembly_ref = os.path.basename(euk_output_assembly_ref)
        noneuk_output_assembly_ref = os.path.basename(
            noneuk_output_assembly_ref)

        log("assembly_ref is {}".format(assembly_ref))
        log("euk_output_assembly_ref is {}".format(euk_output_assembly_ref))
        log("noneuk_output_assembly_ref is {}".format(
            noneuk_output_assembly_ref))

        euk_output_assembly_ref_object = self.au.save_assembly_from_fasta({
            'file': {
                'path': dest + '/' + euk_output_assembly_ref
            },
            'workspace_name':
            task_params['workspace_name'],
            'assembly_name':
            euk_output_assembly_ref
        })

        noneuk_output_assembly_ref_object = self.au.save_assembly_from_fasta({
            'file': {
                'path': dest + '/' + noneuk_output_assembly_ref
            },
            'workspace_name':
            task_params['workspace_name'],
            'assembly_name':
            noneuk_output_assembly_ref
        })

        # generate report
        reportVal = self.generate_report(task_params)
        returnVal = {
            'euk_output_assembly_ref': euk_output_assembly_ref_object,
            'noneuk_output_assembly_ref': noneuk_output_assembly_ref_object
        }
        returnVal.update(reportVal)
        return returnVal
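
# Hedged usage sketch for the class above (not part of the original example);
# the config keys mirror __init__ and the values are placeholders:
#
#   util = EukrepUtil({
#       'SDK_CALLBACK_URL': callback_url,
#       'scratch': '/kb/module/work/tmp',
#       'shock-url': shock_url,
#       'workspace-url': ws_url,
#   })
#   result = util.run_eukrep({'assembly_ref': '1234/5/6',
#                             'workspace_name': 'my_workspace'})
#   print(result['report_name'], result['report_ref'])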
Example #12
0
class FastaToAssembly:

    def __init__(self, callback_url, scratch, ws_url):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)
        self.ws = Workspace(ws_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"
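
    # A hedged sketch of a minimal params dict for import_fasta, based on
    # validate_params() and stage_input() below (values are placeholders):
    #
    #   params = {
    #       'workspace_name': 'my_workspace',
    #       'assembly_name': 'my_assembly',
    #       'file': {'path': '/path/to/contigs.fa'},  # or 'shock_id' / 'ftp_url'
    #       'min_contig_length': 500,                 # optional
    #   }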

    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print(f'filtering FASTA file by contig length (min len={min_contig_length} bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print(f'parsing FASTA file: {fasta_file_path}')
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(f' - parsed {assembly_data["num_contigs"]} contigs, {assembly_data["dna_size"]} bp')
        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)
        with open(self.scratch + "/example.json", 'w') as fid:
            json.dump(assembly_object_to_save, fid)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info

    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        """ construct the WS object data to save based on the parsed info and params """
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            info = self.ws.get_object_info3({'objects':[{'ref': params['taxon_ref']}]})['infos'][0]
            assembly_data['taxon_ref'] = f'{info[6]}/{info[0]}/{info[4]}'

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return sort_dict(assembly_data)

    def parse_fasta(self, fasta_file_path, params):
        """ Do the actual work of inspecting each contig """

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This FASTA file may have amino acids in it instead '
                                         'of the required nucleotides.')
                    raise ValueError(f"This FASTA file has non nucleic acid characters: "
                                     f"{character}")

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
                contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence.encode()).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all-important GC content, rounded to 5 decimal places
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The FASTA header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')

            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list)).encode()).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data

    @staticmethod
    def fasta_filter_contigs_generator(fasta_record_iter, min_contig_length):
        """ generates SeqRecords iterator for writing from a legacy contigset object """
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(f' - filtered out {rows - rows_added} of {rows} contigs that were shorter '
              f'than {min_contig_length} bp.')

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        """ removes all contigs less than the min_contig_length provided """
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path

    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        if len(obj_data["contigs"]) == 0:
            raise ValueError('There are no contigs to save, thus there is no valid assembly.')
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info

    def save_fasta_file_to_shock(self, fasta_file_path):
        """ Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;

        """
        print(f'Uploading FASTA file ({fasta_file_path}) to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})

    def stage_input(self, params):
        """ Setup the input_directory by fetching the files and returning the path to the file"""
        file_path = None
        if 'file' in params:
            if not os.path.isfile(params['file']['path']):
                raise ValueError(
                    'KBase Assembly Utils tried to save an assembly, but the calling '
                    'application specified a file (' + params['file']['path'] + ') that is '
                    'missing. Please check the application logs for details.')
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print(f'Downloading file from SHOCK node: {params["shock_id"]}')
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print(f'Downloading file from: {params["ftp_url"]}')
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid FASTA could be extracted based on the input parameters')


    @staticmethod
    def validate_params(params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or ftp_url is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a FASTA file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('a FASTA file input is required; set one of "file", "shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('exactly one FASTA file input source is required, but more than one of '
                             'the fields "file", "shock_id", and "ftp_url" was set')