Example #1
def upload_interleaved_reads(callback_url, reads_file, ws_name, reads_obj_name,
                             source_reads_upa):
    """
    callback_url = the SDK callback URL, used to construct the service clients
    reads_file = full path to the reads file to upload
    ws_name = the workspace to use for uploading the reads file
    reads_obj_name = the name of the new reads object to save as
    source_reads_upa = if not None, the UPA of the original reads object (kept for provenance)
    """
    # Unfortunately, ReadsUtils only accepts uncompressed FASTQ files - this
    # should be fixed on the KBase side.
    dfu = DataFileUtil(callback_url)
    reads_unpacked = dfu.unpack_file({'file_path': reads_file})['file_path']

    ru = ReadsUtils(callback_url)
    new_reads_upa = ru.upload_reads({
        'fwd_file': reads_unpacked,
        'interleaved': 1,
        'wsname': ws_name,
        'name': reads_obj_name,
        'source_reads_ref': source_reads_upa
    })['obj_ref']
    print('saved ' + str(reads_unpacked) + ' to ' + str(new_reads_upa))
    return new_reads_upa
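
A minimal sketch of how this helper might be called from a KBase SDK method; the values below are illustrative and not from the original module:

# Hypothetical call site; all values are illustrative.
new_upa = upload_interleaved_reads(
    callback_url=os.environ['SDK_CALLBACK_URL'],   # assumes "import os"
    reads_file='/kb/module/work/tmp/reads.fastq.gz',
    ws_name='my_workspace',
    reads_obj_name='my_filtered_reads',
    source_reads_upa='12345/6/7')                  # or None if there is no source object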
Example #2
class FastaToAssembly:

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"


    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print('filtering fasta file by contig length (min len=' + str(min_contig_length) + 'bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print('parsing FASTA file: ' + str(fasta_file_path))
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(' - parsed ' + str(assembly_data['num_contigs']) + ' contigs, ' +
              str(assembly_data['dna_size']) + 'bp')

        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info


    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        ''' construct the WS object data to save based on the parsed info and params '''
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            assembly_data['taxon_ref'] = params['taxon_ref']

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return assembly_data


    def parse_fasta(self, fasta_file_path, params):
        ''' Do the actual work of inspecting each contig '''

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This FASTA file may contain amino acids instead ' +
                                         'of the required nucleotides.')
                    raise ValueError("This FASTA file has non-nucleic-acid characters: {0}".format(character))

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
                contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence.encode()).hexdigest()  # encode: md5 needs bytes on Python 3
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The fasta header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')
            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list))).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data


    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        ''' generates SeqRecords iterator for writing from a legacy contigset object '''
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(' - filtered out ' + str(rows - rows_added) + ' of ' + str(rows) + ' contigs that were shorter than ' +
              str(min_contig_length) + 'bp.')


    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        ''' removes all contigs less than the min_contig_length provided '''
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path


    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info


    def save_fasta_file_to_shock(self, fasta_file_path):
        ''' Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;

        '''
        print('Uploading fasta file (' + str(fasta_file_path) + ') to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})


    def stage_input(self, params):
        ''' Setup the input_directory by fetching the files and returning the path to the file'''
        file_path = None
        if 'file' in params:
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print('Downloading file from SHOCK node: ' + str(params['shock_id']))
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print('Downloading file from: ' + str(params['ftp_url']))
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid fasta could be extracted based on the input parameters')


    def validate_params(self, params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # exactly one of 'file', 'shock_id', or 'ftp_url' is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a fasta file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('exactly one FASTA file input is required; set one of ' +
                             '"file", "shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('exactly one FASTA file input source is required, but more than ' +
                             'one of these fields was set: "file", "shock_id", or "ftp_url"')
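
For reference, a sketch of a params dict that import_fasta would accept under the validation above; all values are illustrative, and exactly one of "file", "shock_id", or "ftp_url" may be set (callback_url, scratch, and ctx are assumed to be available as in the class above):

# Hypothetical inputs (illustrative values only)
params = {
    'workspace_name': 'my_workspace',      # or pass 'workspace_id' instead
    'assembly_name': 'my_assembly',
    'file': {'path': '/kb/module/work/tmp/contigs.fa'},
    'min_contig_length': 500,              # optional length filter
}
assembly_info = FastaToAssembly(callback_url, scratch).import_fasta(ctx, params)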
Example #3
    def stage_input(self, params):
        ''' Setup the input_directory by fetching the files and uncompressing if needed. '''

        # construct the input directory where we stage files
        input_directory = os.path.join(self.cfg.sharedFolder, 'genome-upload-staging-' + str(uuid.uuid4()))
        os.makedirs(input_directory)

        # at this point, the 'file' input is validated, so we don't have to catch any special cases
        # we expect one and only one of path, shock_id, or ftp_url

        # determine how to get the file: if it is from shock, download it.  If it
        # is just sitting there, then use it.  Move the file to the staging input directory
        file = params['file']
        genbank_file_path = None
        if 'path' in file and file['path'] is not None:
            # copy the local file to the input staging directory
            # (NOTE: could just move it, but then this method would have the side effect of moving your
            # file which another SDK module might have an open handle on)
            local_file_path = file['path']
            genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        if 'shock_id' in file and file['shock_id'] is not None:
            # handle shock file
            print('Downloading file from SHOCK node: ' + str(self.cfg.shockURL) + ' - ' + str(file['shock_id']))
            sys.stdout.flush()
            dfUtil = DataFileUtil(self.cfg.callbackURL)
            file_name = dfUtil.shock_to_file({
                                    'file_path': input_directory,
                                    'shock_id': file['shock_id']
                                })['node_file_name']
            genbank_file_path = os.path.join(input_directory, file_name)

        if 'ftp_url' in file and file['ftp_url'] is not None:
            # Note that the Transform originally had a script_utils.download_from_urls method
            # that, if the url is a folder, pulls all subfiles.  That code recently broke when
            # fetching from NCBI (not clear if it is our issue or NCBI), but for now just
            # support the most common case- an FTP to a single file.
            print('Downloading file from: ' + str(file['ftp_url']))
            sys.stdout.flush()

            url = urlparse(file['ftp_url'])
            if url.scheme not in ('ftp', 'http'):
                raise ValueError('Only FTP/HTTP servers are supported')
            file_name = 'genome.gbk'
            if url.path != '':
                file_name = url.path.split('/')[-1]

            req = urllib2.Request(file['ftp_url'])
            response = urllib2.urlopen(req)
            file_data = response.read()

            genbank_file_path = os.path.join(input_directory, file_name)
            with open(genbank_file_path, "w") as genbank_file:
                genbank_file.write(file_data)

        # extract the file if it is compressed
        if genbank_file_path is not None:
            print("staged input file =" + genbank_file_path)
            sys.stdout.flush()
            dfUtil = DataFileUtil(self.cfg.callbackURL)
            dfUtil.unpack_file({ 'file_path': genbank_file_path })

        else:
            raise ValueError('No valid files could be extracted based on the input')

        return input_directory
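
The validated params['file'] input here takes exactly one of three shapes; a sketch with hypothetical values:

# Exactly one of these keys should be non-None (illustrative values)
params = {'file': {'path': '/kb/module/work/tmp/genome.gbk'}}       # local file
params = {'file': {'shock_id': 'hypothetical-shock-node-id'}}       # Shock node
params = {'file': {'ftp_url': 'ftp://ftp.example.org/genome.gbk'}}  # FTP/HTTP URL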
Example #4
    def KButil_Build_InSilico_Metagenomes_with_Grinder(self, ctx, params):
        """
        :param params: instance of type
           "KButil_Build_InSilico_Metagenomes_with_Grinder_Params"
           (KButil_Build_InSilico_Metagenomes_with_Grinder() ** **  Use
           Grinder to generate in silico shotgun metagenomes) -> structure:
           parameter "workspace_name" of type "workspace_name" (** The
           workspace object refs are of form: ** **    objects =
           ws.get_objects([{'ref':
           params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means
           the entire name combining the workspace id and the object name **
           "id" is a numerical identifier of the workspace or object, and
           should just be used for workspace ** "name" is a string identifier
           of a workspace or object.  This is received from Narrative.),
           parameter "input_refs" of type "data_obj_ref", parameter
           "output_name" of type "data_obj_name", parameter "desc" of String,
           parameter "num_reads_per_lib" of Long, parameter
           "population_percs" of String, parameter "read_len_mean" of Long,
           parameter "read_len_stddev" of Double, parameter "pairs_flag" of
           Long, parameter "mate_orientation" of String, parameter
           "insert_len_mean" of Long, parameter "insert_len_stddev" of
           Double, parameter "mutation_dist" of String, parameter
           "mutation_ratio" of String, parameter "qual_good" of Long,
           parameter "qual_bad" of Long, parameter "len_bias_flag" of Long,
           parameter "random_seed" of Long
        :returns: instance of type
           "KButil_Build_InSilico_Metagenomes_with_Grinder_Output" ->
           structure: parameter "report_name" of type "data_obj_name",
           parameter "report_ref" of type "data_obj_ref"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN KButil_Build_InSilico_Metagenomes_with_Grinder

        #### STEP 0: basic init
        ##
        console = []
        invalid_msgs = []
        report_text = ''
        self.log(console,
                 'Running KButil_Build_InSilico_Metagenomes_with_Grinder(): ')
        self.log(console, "\n" + pformat(params))

        # Auth
        token = ctx['token']
        headers = {'Authorization': 'OAuth ' + token}
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        # API Clients
        #SERVICE_VER = 'dev'  # DEBUG
        SERVICE_VER = 'release'
        wsClient = workspaceService(self.workspaceURL, token=token)
        readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                       token=ctx['token'])  # SDK local
        #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token'])  # for SDK local.  local doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.serviceWizardURL,
                               token=ctx['token'])  # for dynamic service
        auClient = AssemblyUtil(self.callbackURL,
                                token=ctx['token'],
                                service_ver=SERVICE_VER)
        dfu = DFUClient(self.callbackURL)

        # param checks
        required_params = [
            'workspace_name', 'input_refs', 'output_name', 'num_reads_per_lib',
            'population_percs', 'read_len_mean', 'read_len_stddev',
            'pairs_flag', 'mate_orientation', 'insert_len_mean',
            'insert_len_stddev', 'mutation_dist', 'mutation_ratio',
            'qual_good', 'qual_bad', 'len_bias_flag', 'random_seed'
        ]
        for arg in required_params:
            if arg not in params or params[arg] is None or params[arg] == '':
                raise ValueError("Must define required param: '" + arg + "'")

        # cast unpredictable numerical params to str (they are mostly used in string contexts)
        numerical_params = [
            'num_reads_per_lib', 'read_len_mean', 'read_len_stddev',
            'pairs_flag', 'insert_len_mean', 'insert_len_stddev', 'qual_good',
            'qual_bad', 'len_bias_flag', 'random_seed'
        ]
        for arg in numerical_params:
            if arg not in params or params[arg] is None or params[arg] == '':
                continue
            params[arg] = str(params[arg])

        # load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        provenance[0]['input_ws_objects'] = []
        for input_ref in params['input_refs']:
            provenance[0]['input_ws_objects'].append(input_ref)

        # set the output paths
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        html_output_dir = os.path.join(output_dir, 'html')
        if not os.path.exists(html_output_dir):
            os.makedirs(html_output_dir)

        #### STEP 1: Parse population_percs and write to file
        ##
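        # The population_percs field parsed below is assumed to be a small
        # whitespace-delimited table: a header row starting with "GENOME",
        # then one row per genome carrying one (optionally %-suffixed)
        # abundance column per output library.  Illustrative example:
        #
        #   GENOME  Sample1  Sample2
        #   G1      50%      25%
        #   G2      50%      75%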
        abundance_str = params['population_percs'].strip()
        abundance_file_path = os.path.join(output_dir, 'my_abundances.txt')
        abundance_config_num_libs = 0
        abundance_config_num_libs_set = False
        grinder_genome_ids = []
        header = []
        out_buf = []

        for row in abundance_str.split("\n"):
            cols = re.split(r'\s+', row)
            if cols[0].upper() == "GENOME":
                for col in cols:
                    if col == '':
                        continue
                    header.append(col)
                continue
            grinder_genome_ids.append(cols[0])
            self.log(console, "GRINDER GENOME ID: '" + cols[0] + "'")  # DEBUG
            out_row = []
            for col in cols:
                if col == '':
                    continue
                elif col == '%':
                    continue
                elif col.endswith('%'):
                    col = col.rstrip('%')
                out_row.append(col)
            out_buf.append("\t".join(out_row))
            num_samples = len(out_row) - 1  # first col is genome id
            if not abundance_config_num_libs_set:
                abundance_config_num_libs_set = True
                abundance_config_num_libs = num_samples
            elif num_samples != abundance_config_num_libs:
                invalid_msgs.append(
                    "inconsistent number of samples in population_percs input field"
                )
        # data validation
        if abundance_config_num_libs == 0:
            invalid_msgs.append(
                "unable to find sample percentages in population_percs input field"
            )
        sample_sums = []
        for row_i, abund_row_str in enumerate(out_buf):
            abund_row = abund_row_str.split()
            for sample_i, abund in enumerate(abund_row[1:]):
                if row_i == 0:
                    sample_sums.append(0)
                #self.log (console, "row_i: "+str(row_i)+" sample_i: "+str(sample_i))  # DEBUG
                sample_sums[sample_i] += float(abund)
        for sample_i, sample_sum in enumerate(sample_sums):
            if sample_sum < 99.5 or sample_sum > 100.5:
                self.log(
                    invalid_msgs, "Sample: " + str(sample_i + 1) + " " +
                    header[sample_i + 1] +
                    " proportions do not sum to 100.0 (sum: " +
                    str(sample_sum) + ")")

        if len(invalid_msgs) == 0:
            with open(abundance_file_path, 'w') as abundance_fh:
                for out_line in out_buf:
                    abundance_fh.write(out_line + "\n")
            # DEBUG
            with open(abundance_file_path, 'r') as abundance_fh:
                for out_line in abundance_fh.readlines():
                    out_line = out_line.rstrip()
                    self.log(console, "ABUNDANCE_CONFIG: '" + out_line + "'")

        #### STEP 2: get genome scaffold sequences
        ##
        if len(invalid_msgs) == 0:
            genomes_src_db_file_path = os.path.join(output_dir, 'genomes.fna')
            read_buf_size = 65536
            write_buf_size = 65536
            accepted_input_types = ["KBaseGenomes.Genome"]
            genome_refs = params['input_refs']
            genome_obj_names = []
            genome_sci_names = []
            assembly_refs = []

            for i, input_ref in enumerate(genome_refs):
                # genome obj info
                try:
                    [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I,
                     SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I,
                     META_I] = range(11)  # object_info tuple indices
                    input_obj_info = wsClient.get_object_info_new(
                        {'objects': [{'ref': input_ref}]})[0]
                    input_obj_type = re.sub(
                        r'-[0-9]+\.[0-9]+$', "",
                        input_obj_info[TYPE_I])  # remove trailing version
                    genome_obj_names.append(input_obj_info[NAME_I])

                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' +
                                     input_ref + '): ' + str(e))
                if input_obj_type not in accepted_input_types:
                    raise ValueError("Input object of type '" +
                                     input_obj_type +
                                     "' not accepted.  Must be one of " +
                                     ", ".join(accepted_input_types))

                # genome obj data
                try:
                    genome_obj = wsClient.get_objects([{
                        'ref': input_ref
                    }])[0]['data']
                    genome_sci_names.append(genome_obj['scientific_name'])
                except:
                    raise ValueError("unable to fetch genome: " + input_ref)

                # Get assembly_refs
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] is None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] is None):
                    msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + \
                          genome_sci_names[i] + \
                          " MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    self.log(console, msg)
                    self.log(invalid_msgs, msg)
                    continue
                elif 'assembly_ref' in genome_obj and genome_obj['assembly_ref'] is not None:
                    msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + \
                          genome_sci_names[i] + " USING assembly_ref: " + str(genome_obj['assembly_ref'])
                    self.log(console, msg)
                    assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj['contigset_ref'] is not None:
                    msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + \
                          genome_sci_names[i] + " USING contigset_ref: " + str(genome_obj['contigset_ref'])
                    self.log(console, msg)
                    assembly_refs.append(genome_obj['contigset_ref'])

        # get fastas for scaffolds
        if len(invalid_msgs) == 0:
            contig_file_paths = []

            for genome_i, input_ref in enumerate(genome_refs):
                contig_file = auClient.get_assembly_as_fasta(
                    {'ref': assembly_refs[genome_i]}).get('path')
                sys.stdout.flush()
                contig_file_path = dfu.unpack_file(
                    {'file_path': contig_file})['file_path']
                contig_file_paths.append(contig_file_path)

            # reformat FASTA IDs for Grinder
            with open(genomes_src_db_file_path, 'w', write_buf_size) as genomes_src_db_fh:
                for genome_i, contig_file_path in enumerate(contig_file_paths):
                    #self.log(console,str(genome_i)+" CONTIG_FILE: "+contig_file_path)  # DEBUG
                    #contig_ids = []
                    with open(contig_file_path, 'r', read_buf_size) as contig_fh:
                        genome_seq = ''
                        contig_seq = ''
                        contig_seqs = []
                        for contig_line in contig_fh.readlines():
                            contig_line = contig_line.rstrip()
                            if contig_line.startswith('>'):
                                #contig_id = contig_line.strip()[1:].split(' ')[0]
                                #contig_ids.append(contig_id)
                                #genomes_src_db_fh.write(">"+grinder_genome_ids[genome_i]+"\n")
                                if contig_seq != '':
                                    contig_seqs.append(contig_seq)
                                    contig_seq = ''
                                    continue
                            else:
                                #genomes_src_db_fh.write(contig_line)
                                contig_seq += contig_line
                        if contig_seq != '':
                            contig_seqs.append(contig_seq)
                            contig_seq = ''

                    # write joined contigs to file
                    # NOTE: using the "-exclude_chars" grinder opt on N to avoid reads spanning contig joins
                    genome_seq = "NNNNNNNNNN".join(contig_seqs)
                    genome_seq = genome_seq.upper()  # grinder might require upper case?
                    genomes_src_db_fh.write(">" + grinder_genome_ids[genome_i] + "\n")
                    genomes_src_db_fh.write(genome_seq + "\n")
                    genome_seq = ''
                    contig_seqs = []

                    # DEBUG
                    #for contig_id in contig_ids:
                    #    self.log(console, "\tCONTIG_ID: "+contig_id)  # DEBUG
            # DEBUG
            toggle = 0
            with open(genomes_src_db_file_path, 'r', read_buf_size) as genomes_src_db_fh:
                for contig_line in genomes_src_db_fh.readlines():
                    contig_line = contig_line.rstrip()
                    if contig_line.startswith('>'):
                        self.log(console, 'GENOMES_SRC_DB: ' + contig_line)
                        genome_id = contig_line[1:]
                        toggle = 0
                    elif toggle == 0:
                        #elif genome_id == 'G3':
                        self.log(
                            console,
                            'GENOMES_SRC_DB: ' + contig_line[0:50] + '...')
                        toggle += 1

        #### STEP 3: Run Grinder
        ##
        if len(invalid_msgs) == 0:
            cmd = []
            cmd.append(self.GRINDER)
            # output
            cmd.append('-base_name')
            cmd.append(params['output_name'])
            cmd.append('-output_dir')
            cmd.append(output_dir)
            # contigs input
            cmd.append('-reference_file')
            cmd.append(genomes_src_db_file_path)
            # abundances
            cmd.append('-abundance_file')
            cmd.append(abundance_file_path)
            # library size
            cmd.append('-total_reads')
            cmd.append(str(params['num_reads_per_lib']))
            # num libraries (overridden by abundance file?)
            cmd.append('-num_libraries')
            cmd.append(str(abundance_config_num_libs))
            # read and insert lens
            cmd.append('-read_dist')
            cmd.append(str(params['read_len_mean']))
            cmd.append('normal')
            cmd.append(str(params['read_len_stddev']))
            if str(params['pairs_flag']) == '1':
                cmd.append('-insert_dist')
                cmd.append(str(params['insert_len_mean']))
                cmd.append('normal')
                cmd.append(str(params['insert_len_stddev']))
                # mate orientation
                cmd.append('-mate_orientation')
                cmd.append(params['mate_orientation'])
            # genome len bias
            cmd.append('-length_bias')
            cmd.append(str(params['len_bias_flag']))
            # mutation model
            cmd.append('-mutation_dist')
            cmd.append(str(params['mutation_dist']))
            cmd.append('-mutation_ratio')
            cmd.append(str(params['mutation_ratio']))
            # qual scores
            cmd.append('-fastq_output')
            cmd.append('1')
            cmd.append('-qual_levels')
            cmd.append(str(params['qual_good']))
            cmd.append(str(params['qual_bad']))
            # skip contig joins
            cmd.append('-exclude_chars')
            cmd.append('NX')
            # explicitly request bidirectional
            cmd.append('-unidirectional')
            cmd.append('0')
            # random seed
            if 'random_seed' in params and params['random_seed'] is not None \
                    and params['random_seed'] != '':
                cmd.append('-random_seed')
                cmd.append(str(params['random_seed']))

            # RUN
            cmd_str = " ".join(cmd)
            self.log(console, "===========================================")
            self.log(console, "RUNNING: " + cmd_str)
            self.log(console, "===========================================")

            cmdProcess = subprocess.Popen(cmd_str,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.STDOUT,
                                          shell=True)
            outputlines = []
            while True:
                line = cmdProcess.stdout.readline()
                if not line:
                    break
                outputlines.append(line)
                self.log(console, line.replace('\n', ''))

            cmdProcess.stdout.close()
            cmdProcess.wait()
            self.log(console,
                     'return code: ' + str(cmdProcess.returncode) + '\n')
            if cmdProcess.returncode != 0:
                raise ValueError('Error running kb_grinder, return code: ' +
                                 str(cmdProcess.returncode) + '\n')

            #report_text += "\n".join(outputlines)
            #report_text += "cmdstring: " + cmdstring + " stdout: " + stdout + " stderr " + stderr

            # capture output for report and paths to out files
            report_text_buf = []
            struct_file_paths = []
            struct_file_names = []
            fastq_file_paths = []
            for out_line in outputlines:
                out_line = out_line.rstrip()
                if 'Community structure' in out_line:
                    clean_line = out_line.lstrip()
                    struct_file_path = re.split(r'\s+', clean_line)[3]
                    struct_file_paths.append(struct_file_path)
                    struct_file_names.append(struct_file_path.split('/')[-1])
                    self.log(console, "STRUCT_FILE_NAME: '" +
                             struct_file_path.split('/')[-1])  # DEBUG
                elif 'FASTQ file' in out_line:
                    clean_line = out_line.lstrip()
                    fastq_file_paths.append(re.split(r'\s+', clean_line)[3])
                else:
                    report_text_buf.append(out_line)
            report_text += "\n".join(report_text_buf)

        #### STEP 4: Upload Read Libs and create reads set
        ##
        if len(invalid_msgs) == 0:
            lib_obj_refs = []
            lib_obj_names = []
            readsSet_items = []

            for sample_i, fastq_file_path in enumerate(fastq_file_paths):

                if not os.path.isfile(fastq_file_path) \
                   or os.path.getsize(fastq_file_path) == 0:
                    raise ValueError("empty read lib generated: " + fastq_file_path)
                else:
                    # lib obj name
                    if len(fastq_file_paths) == 1:  # single library: no per-sample suffix
                        output_obj_name = params['output_name']
                    elif str(params['pairs_flag']) == '1':
                        output_obj_name = params['output_name'] + '-sample' + \
                            str(sample_i + 1) + ".PairedEndLib"
                    else:
                        output_obj_name = params['output_name'] + '-sample' + \
                            str(sample_i + 1) + ".SingleEndLib"
                    lib_obj_names.append(output_obj_name)

                    # upload lib and get obj ref
                    self.log(console,
                             'Uploading generated reads: ' + output_obj_name)
                    sequencing_tech = 'artificial reads'
                    if str(params['pairs_flag']) == '1':
                        interleaved = 1
                    else:
                        interleaved = 0
                    lib_obj_ref = readsUtils_Client.upload_reads({
                        'wsname': str(params['workspace_name']),
                        'name': output_obj_name,
                        'fwd_file': fastq_file_path,
                        'interleaved': interleaved,
                        'sequencing_tech': sequencing_tech
                    })['obj_ref']
                    lib_obj_refs.append(lib_obj_ref)
                    os.remove(fastq_file_path)  # free up disk

                    # add to readsSet
                    readsSet_items.append({
                        'ref': lib_obj_ref,
                        'label': output_obj_name
                    })
            # create readsset
            readsSet_obj_ref = None
            if len(lib_obj_refs) > 1:
                readsSet_obj = {
                    'description': "Grinder Metagenome from " + " ".join(genome_obj_names),
                    'items': readsSet_items
                }
                readsSet_obj_name = params['output_name']
                readsSet_obj_ref = setAPI_Client.save_reads_set_v1({
                    'workspace_name': params['workspace_name'],
                    'output_object_name': readsSet_obj_name,
                    'data': readsSet_obj
                })['set_ref']

        #### STEP 5: Build report
        ##
        reportName = 'kb_grinder_report_' + str(uuid.uuid4())
        reportObj = {
            'objects_created': [],
            #'text_message': '',  # or is it 'message'?
            'message': '',  # or is it 'text_message'?
            'direct_html': '',
            #'direct_html_link_index': 0,
            'file_links': [],
            'html_links': [],
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }

        # message
        if len(invalid_msgs) > 0:
            report_text = "\n".join(invalid_msgs)
        reportObj['message'] = report_text

        if len(invalid_msgs) == 0:
            # objs
            if readsSet_obj_ref is not None:
                reportObj['objects_created'].append({
                    'ref': readsSet_obj_ref,
                    'desc': params['output_name'] + " ReadsSet"
                })
            for lib_obj_i, lib_obj_ref in enumerate(lib_obj_refs):
                reportObj['objects_created'].append({
                    'ref': lib_obj_ref,
                    'desc': lib_obj_names[lib_obj_i]
                })
            # downloadable data
            for data_i, data_path in enumerate(struct_file_paths):
                try:
                    upload_ret = dfu.file_to_shock({
                        'file_path': data_path,
                        # 'pack': 'zip',
                        'make_handle': 0
                    })
                except:
                    raise ValueError('error uploading ' + data_path + ' file to shock')
                reportObj['file_links'].append({
                    'shock_id': upload_ret['shock_id'],
                    'name': struct_file_names[data_i],
                    'label': struct_file_names[data_i]
                })

            # html report
            """
            try:
                html_upload_ret = dfu.file_to_shock({'file_path': html_output_dir,
                                                     'make_handle': 0,
                                                     'pack': 'zip'})
            except:
                raise ValueError ('error uploading html report to shock')
            reportObj['direct_html_link_index'] = 0
            reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                        'name': html_file,
                                        'label': params['output_name']+' HTML'
                                    }
                                   ]
            """

        # save report object
        #
        SERVICE_VER = 'release'
        reportClient = KBaseReport(self.callbackURL,
                                   token=ctx['token'],
                                   service_ver=SERVICE_VER)
        #report_info = report.create({'report':reportObj, 'workspace_name':params['workspace_name']})
        report_info = reportClient.create_extended_report(reportObj)

        returnVal = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        #END KButil_Build_InSilico_Metagenomes_with_Grinder

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method KButil_Build_InSilico_Metagenomes_with_Grinder return value '
                + 'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
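
A hedged sketch of a params dict this method could accept, matching the required_params list above; every value is illustrative (the mutation and orientation strings follow Grinder's conventions but are not taken from the original module):

# Hypothetical input (illustrative values only)
params = {
    'workspace_name': 'my_workspace',
    'input_refs': ['12345/6/7', '12345/8/1'],  # KBaseGenomes.Genome refs
    'output_name': 'sim_metagenome',
    'num_reads_per_lib': 100000,
    'population_percs': "GENOME\tSample1\tSample2\nG1\t50%\t25%\nG2\t50%\t75%",
    'read_len_mean': 150,
    'read_len_stddev': 10,
    'pairs_flag': 1,
    'mate_orientation': 'FR',
    'insert_len_mean': 500,
    'insert_len_stddev': 50,
    'mutation_dist': 'poly4 3e-3 3.3e-8',
    'mutation_ratio': '80 20',
    'qual_good': 30,
    'qual_bad': 10,
    'len_bias_flag': 1,
    'random_seed': 42,
}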
Example #5
class FastaGFFToGenome:
    def __init__(self, config):
        self.cfg = config
        self.dfu = DataFileUtil(self.cfg.callbackURL)

    def import_file(self, params):

        # 1) validate parameters
        self._validate_import_file_params(params)

        # 2) construct the input directory staging area
        input_directory = os.path.join(self.cfg.sharedFolder,
                                       'fast_gff_upload_' + str(uuid.uuid4()))
        os.makedirs(input_directory)
        file_paths = self._stage_input(params, input_directory)

        # 3) extract out the parameters
        params = self._set_parsed_params(params)

        # 4) do the upload
        result = self.upload_genome(
            shock_service_url=self.cfg.shockURL,
            handle_service_url=self.cfg.handleURL,
            workspace_service_url=self.cfg.workspaceURL,
            callback_url=self.cfg.callbackURL,
            input_fasta_file=file_paths["fasta_file"],
            input_gff_file=file_paths["gff_file"],
            workspace_name=params['workspace_name'],
            core_genome_name=params['genome_name'],
            scientific_name=params['scientific_name'],
            taxon_wsname=params['taxon_wsname'],
            taxon_reference=params['taxon_reference'],
            source=params['source'],
            genome_type=params['type'],
            release=params['release'])

        # 5) generate report
        output_data_ref = params['workspace_name'] + "/" + params['genome_name']
        reportObj = {
            'objects_created': [{
                'ref': output_data_ref,
                'description': 'KBase Genome object'
            }],
            'text_message': result['report_string']
        }

        reportClient = KBaseReport(os.environ['SDK_CALLBACK_URL'])
        report_info = reportClient.create({
            'report': reportObj,
            'workspace_name': params['workspace_name']
        })

        # 6) clear the temp directory
        shutil.rmtree(input_directory)

        # 7) return the result
        info = result['genome_info']
        details = {
            'genome_ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]),
            'genome_info': info,
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        return details
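
    # A sketch of the params dict import_file expects (values hypothetical);
    # each file-source dict takes exactly one of 'path' or 'shock_id':
    #
    #   params = {
    #       'workspace_name': 'my_workspace',
    #       'genome_name': 'my_genome',
    #       'fasta_file': {'path': '/kb/module/work/tmp/assembly.fa'},
    #       'gff_file': {'path': '/kb/module/work/tmp/features.gff'},
    #   }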

    def upload_genome(self,
                      shock_service_url=None,
                      handle_service_url=None,
                      workspace_service_url=None,
                      callback_url=None,
                      input_gff_file=None,
                      input_fasta_file=None,
                      workspace_name=None,
                      core_genome_name=None,
                      scientific_name="unknown_taxon",
                      taxon_wsname='ReferenceTaxons',
                      taxon_reference=None,
                      source=None,
                      release=None,
                      genome_type=None):

        # retrieve taxon
        taxonomy, taxon_reference = self._retrieve_taxon(
            taxon_reference, taxon_wsname, scientific_name)

        # reading in Fasta file
        assembly = self._retrieve_fasta_file(input_fasta_file,
                                             core_genome_name, scientific_name,
                                             source)

        if taxon_reference is not None:
            assembly["taxon_ref"] = taxon_reference

        # reading in GFF file
        feature_list = self._retrieve_gff_file(input_gff_file)

        # compile links between features
        feature_hierarchy = self._generate_feature_hierarchy(feature_list)

        # retrieve genome feature list
        (genome_features_list, genome_mrnas_list,
         genome_cdss_list) = self._retrieve_genome_feature_list(
             feature_list, feature_hierarchy, assembly)

        # remove sequences before loading
        for contig in assembly["contigs"]:
            del assembly["contigs"][contig]["sequence"]

        aUtil = AssemblyUtil(callback_url)
        assembly_ref = aUtil.save_assembly_from_fasta({
            'file': {
                'path': input_fasta_file,
                'assembly_name': assembly['assembly_id']
            },
            'workspace_name': workspace_name,
            'assembly_name': assembly['assembly_id']
        })

        # generate genome info
        genome = self._gen_genome_info(core_genome_name, scientific_name,
                                       assembly_ref, genome_features_list,
                                       genome_cdss_list, genome_mrnas_list,
                                       source, assembly, taxon_reference,
                                       taxonomy, input_gff_file)

        workspace_id = self.dfu.ws_name_to_id(workspace_name)
        genome_info = self.dfu.save_objects({
            "id": workspace_id,
            "objects": [{
                "name": core_genome_name,
                "type": "KBaseGenomes.Genome",
                "data": genome
            }]
        })[0]
        report_string = ''

        return {'genome_info': genome_info, 'report_string': report_string}

    def _validate_import_file_params(self, params):
        """
        validate_import_file_params:
                    validates params passed to FastaGFFToGenome.import_file method

        """

        # check for required parameters
        for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # exactly one of 'path' or 'shock_id' is required
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            if not isinstance(file, dict):
                raise ValueError(
                    'Required "{}" field must be a map/dict'.format(key))
            n_valid_fields = 0
            if 'path' in file and file['path'] is not None:
                n_valid_fields += 1
            if 'shock_id' in file and file['shock_id'] is not None:
                n_valid_fields += 1
            if 'ftp_url' in file and file['ftp_url'] is not None:
                n_valid_fields += 1
                raise ValueError(
                    'FTP link is currently not supported for FastaGFFToGenome')
            if n_valid_fields < 1:
                error_msg = 'Required "{}" field must include one source: '.format(key)
                error_msg += 'path | shock_id'
                raise ValueError(error_msg)
            if n_valid_fields > 1:
                error_msg = 'Required "{}" field has too many sources specified: '.format(key)
                error_msg += str(file.keys())
                raise ValueError(error_msg)

        # check for valid type param
        valid_types = ['Reference', 'User upload', 'Representative']
        if params.get('type') and params['type'] not in valid_types:
            error_msg = 'Entered value for type is not one of the valid entries of '
            error_msg += '[' + ', '.join('"' + str(e) + '"' for e in valid_types) + ']'
            raise ValueError(error_msg)

    def _set_parsed_params(self, params):
        log('Setting params')

        # default params
        default_params = {
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'scientific_name': 'unknown_taxon',
            'taxon_reference': None,
            'source': 'User',
            'release': None,
            'type': 'User upload',
            'metadata': {}
        }

        for field in default_params:
            if field not in params:
                params[field] = default_params[field]

        log(json.dumps(params, indent=1))

        return params

    def _stage_input(self, params, input_directory):
        """
        stage_input: Setup the input_directory by fetching the files and uncompressing if needed

        """

        file_paths = dict()
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            file_path = None
            if 'path' in file and file['path'] is not None:
                local_file_path = file['path']
                file_path = os.path.join(input_directory,
                                         os.path.basename(local_file_path))
                log('Copying file from {} to {}'.format(local_file_path,
                                                        file_path))
                shutil.copy2(local_file_path, file_path)

            if 'shock_id' in file and file['shock_id'] is not None:
                # handle shock file
                log('Downloading file from SHOCK node: {}-{}'.format(
                    self.cfg.sharedFolder, file['shock_id']))
                sys.stdout.flush()
                file_name = self.dfu.shock_to_file({
                    'file_path': input_directory,
                    'shock_id': file['shock_id']
                })['node_file_name']
                file_path = os.path.join(input_directory, file_name)

            # extract the file if it is compressed
            if file_path is not None:
                print("staged input file =" + file_path)
                sys.stdout.flush()
                dfUtil_result = self.dfu.unpack_file({'file_path': file_path})
                file_paths[key] = dfUtil_result['file_path']
            else:
                raise ValueError(
                    'No valid files could be extracted based on the input')

        return file_paths

    def _retrieve_taxon(self, taxon_reference, taxon_wsname, scientific_name):
        """
        _retrieve_taxon: retrieve taxonomy and taxon_reference

        """
        taxon_id = -1
        taxon_object_name = "unknown_taxon"

        # retrieve lookup object if scientific name provided
        if (taxon_reference is None
                and scientific_name != "unknown_taxon"):
            # retrieve taxon lookup object then find taxon id
            taxon_lookup = self.dfu.get_objects({
                'object_refs': [taxon_wsname + "/taxon_lookup"],
                'ignore_errors': 0
            })['data'][0]['data']['taxon_lookup']

            if (scientific_name[0:3] in taxon_lookup
                    and scientific_name in taxon_lookup[scientific_name[0:3]]):
                taxon_id = taxon_lookup[scientific_name[0:3]][scientific_name]
                taxon_object_name = "{}_taxon".format(str(taxon_id))

        # retrieve Taxon object
        taxon_info = {}
        if taxon_reference is None:
            taxon_info = self.dfu.get_objects({
                'object_refs': [taxon_wsname + "/" + taxon_object_name],
                'ignore_errors': 0
            })['data'][0]
            taxon_reference = "{}/{}/{}".format(taxon_info['info'][6],
                                                taxon_info['info'][0],
                                                taxon_info['info'][4])
        else:
            taxon_info = self.dfu.get_objects({
                "object_refs": [taxon_reference],
                'ignore_errors': 0
            })['data'][0]

        taxonomy = taxon_info['data']['scientific_lineage']

        return taxonomy, taxon_reference

    def _retrieve_fasta_file(self, input_fasta_file, core_genome_name,
                             scientific_name, source):
        """
        _retrieve_fasta_file: retrieve info from fasta_file
                              https://www.biostars.org/p/710/

        """
        log("Reading FASTA file")

        assembly = {
            "contigs": {},
            "dna_size": 0,
            "gc_content": 0,
            "md5": [],
            "base_counts": {}
        }
        contig_seq_start = 0

        input_file_handle = open(input_fasta_file, 'rb')

        # alternate header and sequence
        faiter = (x[1] for x in itertools.groupby(input_file_handle,
                                                  lambda line: line[0] == ">"))
        for header in faiter:
            # drop the ">"
            header = header.next()[1:].strip()
            # join all sequence lines to one.
            seq = "".join(s.strip() for s in faiter.next())

            try:
                fasta_header, fasta_description = header.split(' ', 1)
            except:
                fasta_header = header
                fasta_description = None

            # Handle record
            seq = seq.upper()

            # Build contig objects for Assembly
            seq_count = dict(collections.Counter(seq))

            # to delete at end, but required for now
            contig_dict = {"sequence": seq}

            Ncount = 0
            if "N" in seq_count:
                Ncount = seq_count["N"]
            contig_dict["Ncount"] = Ncount

            for character in seq_count:
                if character in assembly["base_counts"]:
                    assembly["base_counts"][character] += seq_count[character]
                else:
                    assembly["base_counts"][character] = seq_count[character]

            contig_seq_length = len(seq)
            assembly["dna_size"] += contig_seq_length
            contig_gc_length = seq.count("G")
            contig_gc_length += seq.count("C")
            contig_dict["gc_content"] = float("{0:.2f}".format(
                float(contig_gc_length) / float(contig_seq_length)))
            assembly["gc_content"] += contig_gc_length
            contig_dict["contig_id"] = fasta_header
            contig_dict["name"] = fasta_header
            contig_dict["length"] = contig_seq_length
            contig_dict["md5"] = hashlib.md5(seq).hexdigest()
            assembly["md5"].append(contig_dict["md5"])

            if fasta_description is not None:
                contig_dict["description"] = fasta_description

            contig_dict["is_circular"] = "Unknown"
            contig_dict["start_position"] = contig_seq_start
            contig_dict["num_bytes"] = sys.getsizeof(contig_dict["sequence"])
            assembly["contigs"][fasta_header] = contig_dict

            # used for start of next sequence and total gc_content
            contig_seq_start += contig_seq_length

        input_file_handle.close()

        assembly["gc_content"] = float("{0:.2f}".format(
            float(assembly["gc_content"]) / float(contig_seq_start)))
        assembly["md5"] = hashlib.md5(",".join(assembly["md5"])).hexdigest()
        assembly["assembly_id"] = core_genome_name + "_assembly"
        assembly["name"] = scientific_name
        assembly["external_source"] = source
        assembly["external_source_id"] = os.path.basename(input_fasta_file)
        assembly["external_source_origination_date"] = str(
            os.stat(input_fasta_file).st_ctime)
        assembly["num_contigs"] = len(assembly["contigs"].keys())
        assembly["type"] = "Unknown"
        assembly[
            "notes"] = "Note MD5s are generated from uppercasing the sequences"

        return assembly
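
    # --- Illustrative sketch of the itertools.groupby FASTA-iterator recipe
    # used in _retrieve_fasta_file above (see the biostars link). It yields
    # (header, sequence) pairs; the helper name is an assumption, not KBase API.
    @staticmethod
    def _demo_fasta_iter(fasta_path):
        import itertools
        with open(fasta_path, 'r') as handle:
            # groupby alternates runs of header lines (">") and sequence lines
            faiter = (group for _, group in itertools.groupby(
                handle, lambda line: line[0] == ">"))
            for header_group in faiter:
                header = next(header_group)[1:].strip()
                seq = "".join(s.strip() for s in next(faiter)).upper()
                yield header, seq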

    def _retrieve_gff_file(self, input_gff_file):
        """
        _retrieve_gff_file: retrieve info from gff_file
    
        """
        log("Reading GFF file")

        feature_list = dict()
        is_phytozome = 0
        is_patric = 0

        gff_file_handle = open(input_gff_file, 'r')
        current_line = gff_file_handle.readline()
        line_count = 0

        while (current_line != ''):
            current_line = current_line.strip()

            if (current_line == "" or current_line.startswith("#")):
                pass
            else:
                #Split line
                (contig_id, source_id, feature_type, start, end, score, strand,
                 phase, attributes) = current_line.split('\t')

                #Checking to see if Phytozome
                if ("phytozome" in source_id or "Phytozome" in source_id):
                    is_phytozome = 1

                #Checking to see if PATRIC
                if ("PATRIC" in source_id):
                    is_patric = 1

                #PATRIC prepends their contig ids with some gibberish
                if (is_patric and "|" in contig_id):
                    contig_id = contig_id.split("|", 1)[1]

                #Features grouped by contigs first
                if (contig_id not in feature_list):
                    feature_list[contig_id] = list()

                #Populating basic feature object
                ftr = {
                    'contig': contig_id,
                    'source': source_id,
                    'type': feature_type,
                    'start': int(start),
                    'end': int(end),
                    'score': score,
                    'strand': strand,
                    'phase': phase,
                    'attributes': attributes
                }

                #Populating with attribute key-value pair
                #This is where the feature id is from
                for attribute in attributes.split(";"):
                    attribute = attribute.strip()

                    #Sometimes empty string
                    if (attribute == ""):
                        continue

                    #Limit the split to 1 because '=' can also appear in the value
                    #Sometimes the "=" is missing; assume a space separator instead
                    if ("=" in attribute):
                        key, value = attribute.split("=", 1)
                    elif (" " in attribute):
                        key, value = attribute.split(" ", 1)
                    else:
                        log("Warning: attribute " + attribute +
                            " cannot be separated into key,value pair")
                        continue

                    ftr[key] = value

                feature_list[contig_id].append(ftr)

            current_line = gff_file_handle.readline()

        gff_file_handle.close()

        #Some GFF/GTF files don't use "ID" so we go through the possibilities
        feature_list = self._add_missing_identifiers(feature_list)

        #Most bacterial files have only CDSs
        #In order to work with prokaryotic and eukaryotic gene structure synonymously
        #Here we add feature dictionaries representing the parent gene and mRNAs
        feature_list = self._add_missing_parents(feature_list)

        #Phytozome has the annoying habit of editing their identifiers so we fix them
        if (is_phytozome):
            self._update_phytozome_features(feature_list)

        #All identifiers need to be checked so that they follow the same general rules
        #Rules are listed within the function itself
        feature_list = self._update_identifiers(feature_list)

        #If phytozome, the edited files need to be re-printed as GFF so that it works better with RNA-Seq pipeline
        if (is_phytozome):
            self._print_phytozome_gff(input_gff_file, feature_list)

        return feature_list
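
    # --- Illustrative, standalone sketch of the attribute-column parsing done
    # above: split the ninth GFF column on ";", then each piece on the first
    # "=" (or, as a fallback, the first space). The helper name is made up.
    @staticmethod
    def _demo_parse_gff_attributes(attributes):
        parsed = {}
        for attribute in attributes.split(";"):
            attribute = attribute.strip()
            if attribute == "":
                continue
            if "=" in attribute:
                key, value = attribute.split("=", 1)
            elif " " in attribute:
                key, value = attribute.split(" ", 1)
            else:
                continue  # not a separable key,value pair
            parsed[key] = value
        return parsed
    # e.g. _demo_parse_gff_attributes("ID=mRNA1;Parent=gene1")
    #      == {"ID": "mRNA1", "Parent": "gene1"}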

    def _add_missing_identifiers(self, feature_list):

        #General rule is to iterate through a range of possibilities if "ID" is missing
        for contig in feature_list.keys():
            for i in range(len(feature_list[contig])):
                if ("ID" not in feature_list[contig][i]):
                    for key in ("transcriptId", "proteinId", "PACid", "pacid",
                                "Parent"):
                        if (key in feature_list[contig][i]):
                            feature_list[contig][i]['ID'] = feature_list[
                                contig][i][key]
                            break

                    #If no ID was found, log an error for the key feature types
                    for ftr_type in ("gene", "mRNA", "CDS"):
                        if (ftr_type not in feature_list[contig][i]['type']):
                            continue

                        if ("ID" not in feature_list[contig][i]):
                            log("Error: Cannot find unique ID to utilize in GFF attributes: "+ \
                                    feature_list[contig][i]['contig']+"."+ \
                                    feature_list[contig][i]['source']+"."+ \
                                    feature_list[contig][i]['type']+": "+ \
                                    feature_list[contig][i]['attributes'])
        return feature_list

    def _generate_feature_hierarchy(self, feature_list):

        feature_hierarchy = {contig: {} for contig in feature_list}

        #Need to remember mRNA/gene links for CDSs
        mRNA_gene_dict = {}
        exon_list_position_dict = {}

        for contig in feature_list:
            for i in range(len(feature_list[contig])):
                ftr = feature_list[contig][i]

                if ("gene" in ftr["type"]):
                    feature_hierarchy[contig][ftr["ID"]] = {
                        "utrs": [],
                        "mrnas": [],
                        "cdss": [],
                        "index": i
                    }

                if ("UTR" in ftr["type"]):
                    feature_hierarchy[contig][mRNA_gene_dict[
                        ftr["Parent"]]]["utrs"].append({
                            "id": ftr["ID"],
                            "index": i
                        })

                if ("RNA" in ftr["type"]):
                    feature_hierarchy[contig][ftr["Parent"]]["mrnas"].append({
                        "id":
                        ftr["ID"],
                        "index":
                        i,
                        "cdss": []
                    })
                    mRNA_gene_dict[ftr["ID"]] = ftr["Parent"]
                    exon_list_position_dict[ftr["ID"]] = len(
                        feature_hierarchy[contig][ftr["Parent"]]["mrnas"]) - 1

                if ("CDS" in ftr["type"]):
                    gene_id = mRNA_gene_dict[ftr["Parent"]]
                    mrna_position = exon_list_position_dict[ftr["Parent"]]
                    feature_hierarchy[contig][gene_id]["mrnas"][mrna_position][
                        "cdss"].append({"id": ftr["ID"], "index": i})

        return feature_hierarchy
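
    # --- Illustrative example (made-up values) of the nested structure that
    # _generate_feature_hierarchy returns: contig -> gene ID -> its UTRs and
    # mRNAs, each mRNA carrying its own "cdss" list; every "index" points back
    # into feature_list.
    @staticmethod
    def _demo_feature_hierarchy():
        return {
            "Chr1": {
                "gene1": {
                    "index": 0,
                    "utrs": [{"id": "gene1.five_prime_UTR", "index": 3}],
                    "mrnas": [{"id": "mRNA1", "index": 1,
                               "cdss": [{"id": "CDS1", "index": 2}]}],
                    "cdss": [],
                }
            }
        }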

    def _add_missing_parents(self, feature_list):

        #General rule: if a CDS or RNA is missing its parent, create one
        for contig in feature_list.keys():
            ftrs = feature_list[contig]
            new_ftrs = []
            for i in range(len(ftrs)):
                if ("Parent" not in ftrs[i]):
                    #Assuming parent doesn't exist at all, so create de novo instead of trying to find it
                    if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]):
                        new_gene_ftr = copy.deepcopy(ftrs[i])
                        new_gene_ftr["type"] = "gene"
                        ftrs[i]["Parent"] = new_gene_ftr["ID"]
                        new_ftrs.append(new_gene_ftr)

                    if ("CDS" in ftrs[i]["type"]):
                        new_rna_ftr = copy.deepcopy(ftrs[i])
                        new_rna_ftr["type"] = "mRNA"
                        new_ftrs.append(new_rna_ftr)
                        ftrs[i]["Parent"] = new_rna_ftr["ID"]

                new_ftrs.append(ftrs[i])
            feature_list[contig] = new_ftrs
        return feature_list
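
    # --- Illustrative walk-through, under made-up IDs, of the rule above for a
    # lone CDS row: clone it into a synthetic gene and a synthetic mRNA so that
    # CDS-only (prokaryotic) files get the same gene/mRNA/CDS structure as
    # eukaryotic ones. The colliding IDs are deduplicated by _update_identifiers.
    @staticmethod
    def _demo_orphan_cds():
        import copy
        cds = {"ID": "cds1", "type": "CDS"}
        gene = copy.deepcopy(cds)
        gene["type"] = "gene"
        mrna = copy.deepcopy(cds)
        mrna["type"] = "mRNA"
        cds["Parent"] = mrna["ID"]
        return [gene, mrna, cds]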

    def _update_phytozome_features(self, feature_list):

        #General rule is to use the "Name" field where possible
        #And update parent attribute correspondingly
        for contig in feature_list.keys():
            feature_position_dict = {}
            for i in range(len(feature_list[contig])):

                #Maintain old_id for reference
                #Sometimes ID isn't available, so use PACid
                old_id = None
                for key in ("ID", "PACid", "pacid"):
                    if (key in feature_list[contig][i]):
                        old_id = feature_list[contig][i][key]
                        break
                if (old_id is None):
                    #This should be an error
                    log("Cannot find unique ID, PACid, or pacid in GFF attributes: " +
                        feature_list[contig][i]['contig'] + "." +
                        feature_list[contig][i]['source'] + ": " +
                        feature_list[contig][i]['attributes'])
                    continue

                #Retain old_id
                feature_position_dict[old_id] = i

                #In Phytozome, gene and mRNA have "Name" field, CDS do not
                if ("Name" in feature_list[contig][i]):
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        "Name"]

                if ("Parent" in feature_list[contig][i]):
                    #Update Parent to match new ID of parent ftr
                    feature_list[contig][i]["Parent"] = feature_list[contig][
                        feature_position_dict[feature_list[contig][i]
                                              ["Parent"]]]["ID"]

        return feature_list

    def _update_identifiers(self, feature_list):

        #General rules:
        #1) Genes keep identifier
        #2) RNAs keep identifier only if it's different from the gene's, otherwise append ".mRNA"
        #3) CDS always uses RNA identifier with ".CDS" appended
        #4) CDS appended with an incremented digit

        CDS_count_dict = dict()
        mRNA_parent_dict = dict()

        for contig in feature_list.keys():
            for ftr in feature_list[contig]:
                if ("Parent" in ftr):

                    #Retain old_id of parents
                    old_id = ftr["ID"]

                    if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]):
                        ftr["ID"] = ftr["Parent"] + "." + ftr["type"]

                    #link old to new ids for mRNA to use with CDS
                    if ("RNA" in ftr["type"]):
                        mRNA_parent_dict[old_id] = ftr["ID"]

                    if ("CDS" in ftr["type"]):
                        #Increment CDS identifier
                        if (ftr["ID"] not in CDS_count_dict):
                            CDS_count_dict[ftr["ID"]] = 1
                        else:
                            CDS_count_dict[ftr["ID"]] += 1
                        ftr["ID"] = ftr["ID"] + "." + str(
                            CDS_count_dict[ftr["ID"]])

                        #Recall new mRNA id for parent
                        ftr["Parent"] = mRNA_parent_dict[ftr["Parent"]]

        return feature_list
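
    # --- Standalone sketch of rules 3 and 4 above: every CDS row takes its
    # parent mRNA's ID plus ".CDS", then an incrementing numeric suffix. The
    # helper name and arguments are assumptions for illustration.
    @staticmethod
    def _demo_cds_ids(parent_mrna_id, num_cds_rows):
        cds_count = {}
        ids = []
        for _ in range(num_cds_rows):
            base = parent_mrna_id + ".CDS"
            cds_count[base] = cds_count.get(base, 0) + 1
            ids.append("{}.{}".format(base, cds_count[base]))
        return ids
    # e.g. _demo_cds_ids("mRNA1", 2) == ["mRNA1.CDS.1", "mRNA1.CDS.2"]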

    def _print_phytozome_gff(self, input_gff_file, feature_list):

        #Write modified feature ids to new file
        input_gff_file = input_gff_file.replace("gene", "edited_gene") + ".gz"
        log("Printing to new file: " + input_gff_file)
        try:
            gff_file_handle = gzip.open(input_gff_file, 'wb')
        except IOError:
            log("Failed to open " + input_gff_file)
            raise

        for contig in sorted(feature_list):
            for ftr in feature_list[contig]:

                #Re-build attributes
                attributes_dict = {}
                for attribute in ftr["attributes"].split(";"):
                    attribute = attribute.strip()

                    #Sometimes empty string
                    if (attribute == ""):
                        continue

                    #Limit the split to 1 because '=' can also appear in the value
                    #Sometimes the "=" is missing; assume a space separator instead
                    if ("=" in attribute):
                        key, value = attribute.split("=", 1)
                    elif (" " in attribute):
                        key, value = attribute.split(" ", 1)
                    else:
                        log("Warning: attribute " + attribute +
                            " cannot be separated into key,value pair")
                        continue

                    if (ftr[key] != value):
                        value = ftr[key]
                    attributes_dict[key] = value

                ftr["attributes"] = ";".join(key + "=" + attributes_dict[key]
                                             for key in attributes_dict.keys())

                new_line = "\t".join(
                    str(ftr[key]) for key in [
                        'contig', 'source', 'type', 'start', 'end', 'score',
                        'strand', 'phase', 'attributes'
                    ])
                gff_file_handle.write(new_line + "\n")
        gff_file_handle.close()
        return
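
    # --- Minimal standalone sketch of the gzip re-printing above (Python 2
    # style, matching this module): tab-join the nine GFF columns and remember
    # the explicit newline per row. Path and rows are caller-supplied.
    @staticmethod
    def _demo_write_gff_gz(out_path, rows):
        import gzip
        columns = ['contig', 'source', 'type', 'start', 'end', 'score',
                   'strand', 'phase', 'attributes']
        gff_file_handle = gzip.open(out_path, 'wb')
        try:
            for ftr in rows:
                gff_file_handle.write(
                    "\t".join(str(ftr[c]) for c in columns) + "\n")
        finally:
            gff_file_handle.close()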

    def _retrieve_genome_feature_list(self, feature_list, feature_hierarchy,
                                      assembly):

        genome_features_list = list()
        genome_mrnas_list = list()
        genome_cdss_list = list()
        genome_translation_issues = list()

        for contig in feature_hierarchy:
            for gene in feature_hierarchy[contig]:

                #We only iterate through the gene objects
                #And then for each gene object, retrieve the necessary mRNA and CDS objects indirectly

                ftr = feature_list[contig][feature_hierarchy[contig][gene]
                                           ["index"]]
                contig_sequence = assembly["contigs"][
                    ftr["contig"]]["sequence"]
                gene_ftr = self._convert_ftr_object(
                    ftr, contig_sequence
                )  #reverse-complementation for negative strands done here

                #Add non-optional terms
                gene_ftr["mrnas"] = list()
                gene_ftr["cdss"] = list()
                gene_ftr["ontology_terms"] = dict()

                #Retaining longest sequences for gene feature
                longest_protein_length = 0
                longest_protein_sequence = ""
                for mRNA in feature_hierarchy[contig][gene]["mrnas"]:

                    ########################################################
                    # Construct mRNA Ftr
                    ########################################################
                    ftr = feature_list[contig][mRNA["index"]]
                    contig_sequence = assembly["contigs"][
                        ftr["contig"]]["sequence"]
                    mRNA_ftr = self._convert_ftr_object(
                        ftr, contig_sequence
                    )  #reverse-complementation for negative strands done here

                    #Modify mrna object for use in mrna array
                    #Objects will be un-used until further notice
                    mRNA_ftr['parent_gene'] = gene_ftr['id']

                    #If there are CDSs, use a new CDS ID without incrementation, as they were aggregated
                    if (len(mRNA['cdss']) > 0):
                        mRNA_ftr['cds'] = mRNA_ftr['id'] + ".CDS"
                    else:
                        mRNA_ftr['cds'] = ""

                    #Add to mrnas array
                    genome_mrnas_list.append(mRNA_ftr)

                    #Add ids to gene_ftr arrays
                    gene_ftr["mrnas"].append(mRNA_ftr["id"])

                    ########################################################
                    # Construct transcript, protein sequence, UTR, CDS locations
                    ########################################################

                    #At time of writing, all of this aggregation should probably be done in a single function
                    cds_exons_locations_array = list()
                    cds_cdna_sequence = str()
                    protein_sequence = str()
                    if (len(mRNA["cdss"]) > 0):
                        (cds_exons_locations_array, cds_cdna_sequence,
                         protein_sequence) = self._cds_aggregation_translation(
                            mRNA["cdss"], feature_list[contig], assembly,
                            genome_translation_issues)

                    UTRs = list()
                    if ("utrs" in feature_hierarchy[contig][gene] and
                            len(feature_hierarchy[contig][gene]["utrs"]) > 0):
                        for UTR in feature_hierarchy[contig][gene]["utrs"]:
                            ftr = feature_list[contig][UTR["index"]]
                            if ("Parent" in ftr
                                    and ftr["Parent"] == mRNA_ftr["id"]):
                                UTRs.append(ftr)

                    mrna_exons_locations_array = copy.deepcopy(
                        cds_exons_locations_array)
                    mrna_transcript_sequence = str(cds_cdna_sequence)
                    if (len(UTRs) > 0):
                        (mrna_exons_locations_array,
                         mrna_transcript_sequence) = self._utr_aggregation(
                            UTRs, assembly, mrna_exons_locations_array,
                            cds_cdna_sequence)

                    #Update sequence and locations
                    mRNA_ftr["dna_sequence"] = mrna_transcript_sequence
                    mRNA_ftr["dna_sequence_length"] = len(
                        mrna_transcript_sequence)
                    mRNA_ftr["location"] = mrna_exons_locations_array
                    mRNA_ftr["md5"] = hashlib.md5(
                        mRNA_ftr["dna_sequence"]).hexdigest()

                    #Remove DNA
                    del mRNA_ftr["dna_sequence"]
                    del mRNA_ftr["dna_sequence_length"]

                    #Skip CDS if not present
                    if (len(mRNA["cdss"]) == 0):
                        continue

                    #Remove asterisk representing stop codon if present
                    if (len(protein_sequence) > 0
                            and protein_sequence[-1] == '*'):
                        protein_sequence = protein_sequence[:-1]

                    #Save longest sequence
                    if (len(protein_sequence) > longest_protein_length):
                        longest_protein_length = len(protein_sequence)
                        longest_protein_sequence = protein_sequence

                    ########################################################
                    # Construct CDS Ftr
                    ########################################################
                    CDS_ftr = dict()
                    CDS_ftr['type'] = 'CDS'

                    #New CDS ID without incrementation, as the CDS rows were aggregated
                    CDS_ftr['id'] = mRNA_ftr['id'] + '.CDS'

                    #Add gene/mrna links
                    CDS_ftr['parent_gene'] = gene_ftr['id']
                    CDS_ftr['parent_mrna'] = mRNA_ftr['id']

                    #Update sequence and locations
                    CDS_ftr["dna_sequence"] = cds_cdna_sequence
                    CDS_ftr["dna_sequence_length"] = len(cds_cdna_sequence)
                    CDS_ftr["location"] = cds_exons_locations_array
                    CDS_ftr["md5"] = hashlib.md5(
                        CDS_ftr["dna_sequence"]).hexdigest()

                    #Add protein
                    CDS_ftr["protein_translation"] = str(
                        protein_sequence).upper()
                    CDS_ftr["protein_translation_length"] = len(
                        CDS_ftr["protein_translation"])
                    #Only generate md5 for dna sequences
                    #CDS_ftr["md5"] = hashlib.md5(CDS_ftr["protein_translation"]).hexdigest()

                    #Add empty non-optional fields for populating in future
                    CDS_ftr["ontology_terms"] = dict()
                    if ("aliases" not in CDS_ftr):
                        CDS_ftr["aliases"] = list()
                    if ("function" not in CDS_ftr):
                        CDS_ftr["function"] = ""

                    #Add to cdss array
                    genome_cdss_list.append(CDS_ftr)

                    #Add ids to gene_ftr arrays
                    gene_ftr["cdss"].append(CDS_ftr["id"])

                gene_ftr["protein_translation"] = longest_protein_sequence
                gene_ftr["protein_translation_length"] = longest_protein_length
                genome_features_list.append(gene_ftr)

        msg = "Genome features processed: {} genes, {} RNAs, and {} CDSs\n".format(
            len(genome_features_list), len(genome_mrnas_list),
            len(genome_cdss_list))
        msg += "{} mRNA(s) had errors during translation".format(
            len(genome_translation_issues))
        log(msg)

        return genome_features_list, genome_mrnas_list, genome_cdss_list

    def _gen_genome_info(self, core_genome_name, scientific_name, assembly_ref,
                         genome_features_list, genome_cdss_list,
                         genome_mrnas_list, source, assembly, taxon_reference,
                         taxonomy, input_gff_file):
        """
        _gen_genome_info: generate genome info

        """
        genome = dict()
        genome["id"] = core_genome_name
        genome["scientific_name"] = scientific_name
        genome["assembly_ref"] = assembly_ref
        genome["features"] = genome_features_list
        genome["cdss"] = genome_cdss_list
        genome["mrnas"] = genome_mrnas_list
        genome["source"] = source
        genome["domain"] = "Eukaryota"
        genome["genetic_code"] = 1
        genome["gc_content"] = assembly["gc_content"]
        genome["dna_size"] = assembly["dna_size"]

        if taxon_reference is not None:
            genome["taxon_ref"] = taxon_reference
            genome["taxonomy"] = taxonomy

        gff_file_to_shock = self.dfu.file_to_shock({
            'file_path': input_gff_file,
            'make_handle': 1,
            'pack': "gzip"
        })
        gff_handle_ref = gff_file_to_shock['handle']['hid']

        genome['gff_handle_ref'] = gff_handle_ref

        return genome

    def _convert_ftr_object(self, old_ftr, contig):
        new_ftr = dict()
        new_ftr["id"] = old_ftr["ID"]

        dna_sequence = Seq(contig[old_ftr["start"] - 1:old_ftr["end"]],
                           IUPAC.ambiguous_dna)

        # reverse complement
        if (old_ftr["strand"] == "-"):
            dna_sequence = dna_sequence.reverse_complement()
            old_start = old_ftr["start"]
            old_ftr["start"] = old_ftr["end"]
            old_ftr["end"] = old_start

        new_ftr["dna_sequence"] = str(dna_sequence).upper()
        new_ftr["dna_sequence_length"] = len(dna_sequence)
        new_ftr["md5"] = hashlib.md5(str(dna_sequence)).hexdigest()
        new_ftr["location"] = [[
            old_ftr["contig"], old_ftr["start"], old_ftr["strand"],
            len(dna_sequence)
        ]]
        new_ftr["type"] = old_ftr["type"]

        new_ftr["aliases"] = list()
        for key in ("transcriptId", "proteinId", "PACid", "pacid"):
            if (key in old_ftr.keys()):
                new_ftr["aliases"].append(key + ":" + old_ftr[key])

        return new_ftr
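
    # --- Standalone sketch of the strand handling in _convert_ftr_object,
    # assuming Biopython's Seq is importable as above (the IUPAC alphabet
    # argument is optional and omitted here): slice the 1-based start/end out
    # of the contig, then reverse-complement on the minus strand.
    @staticmethod
    def _demo_extract_feature_dna(contig_seq, start, end, strand):
        from Bio.Seq import Seq
        dna = Seq(contig_seq[start - 1:end])
        if strand == "-":
            dna = dna.reverse_complement()
        return str(dna).upper()
    # e.g. _demo_extract_feature_dna("aaATGCtt", 3, 6, "-") == "GCAT"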

    def _utr_aggregation(self, utr_list, assembly, exons, exon_sequence):

        #create copies of locations and transcript
        utrs_exons = list(exons)
        utr_exon_sequence = exon_sequence

        five_prime_dna_sequence = ""
        three_prime_dna_sequence = ""
        five_prime_locations = list()
        three_prime_locations = list()

        for UTR in (utr_list):
            contig_sequence = assembly["contigs"][UTR["contig"]]["sequence"]
            UTR_ftr = self._convert_ftr_object(
                UTR, contig_sequence
            )  #reverse-complementation for negative strands done here

            #aggregate sequences and locations
            if ("five_prime" in UTR_ftr["id"]):
                five_prime_dna_sequence += UTR_ftr["dna_sequence"]
                five_prime_locations.append(UTR_ftr["location"][0])
            if ("three_prime" in UTR_ftr["id"]):
                three_prime_dna_sequence += UTR_ftr["dna_sequence"]
                three_prime_locations.append(UTR_ftr["location"][0])

        #Handle five_prime UTRs
        if (len(five_prime_locations) > 0):

            #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file)
            five_prime_locations = sorted(five_prime_locations,
                                          key=lambda x: x[1])

            #Merge last UTR with CDS if "next" to each other
            if ((utrs_exons[0][2] == "+" and
                 five_prime_locations[-1][1] + five_prime_locations[-1][3] == utrs_exons[0][1]) or
                (utrs_exons[0][2] == "-" and
                 five_prime_locations[-1][1] - five_prime_locations[-1][3] == utrs_exons[0][1])):

                #Remove last UTR
                last_five_prime_location = five_prime_locations[-1]
                five_prime_locations = five_prime_locations[:-1]

                #"Add" last UTR to first exon
                utrs_exons[0][1] = last_five_prime_location[1]
                utrs_exons[0][3] += last_five_prime_location[3]

            #Prepend other UTRs if available
            if (len(five_prime_locations) > 0):
                utrs_exons = five_prime_locations + utrs_exons

        utr_exon_sequence = five_prime_dna_sequence + utr_exon_sequence

        #Handle three_prime UTRs
        if (len(three_prime_locations) > 0):

            #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file)
            three_prime_locations = sorted(three_prime_locations,
                                           key=lambda x: x[1])

            #Merge first UTR with CDS if "next" to each other
            if ((utrs_exons[0][2] == "+" and
                 utrs_exons[-1][1] + utrs_exons[-1][3] == three_prime_locations[0][1]) or
                (utrs_exons[0][2] == "-" and
                 utrs_exons[-1][1] - utrs_exons[-1][3] == three_prime_locations[0][1])):

                #Remove first UTR
                first_three_prime_location = three_prime_locations[0]
                three_prime_locations = three_prime_locations[1:]

                #"Add" first UTR to last exon
                utrs_exons[-1][3] += first_three_prime_location[3]

        #Append other UTRs if available
        if (len(three_prime_locations) > 0):
            utrs_exons = utrs_exons + three_prime_locations

        utr_exon_sequence += three_prime_dna_sequence

        return (utrs_exons, utr_exon_sequence)
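
    # --- Standalone sketch of the plus-strand adjacency test used above when
    # merging a 5' UTR into the first exon. Locations are
    # [contig, start, strand, length]; on "+" the UTR ends where the exon starts.
    @staticmethod
    def _demo_merge_five_prime_utr(utr_loc, exon_loc):
        merged = list(exon_loc)
        if utr_loc[2] == "+" and utr_loc[1] + utr_loc[3] == exon_loc[1]:
            merged[1] = utr_loc[1]                # exon now starts at the UTR start
            merged[3] = exon_loc[3] + utr_loc[3]  # and grows by the UTR's length
        return merged
    # e.g. _demo_merge_five_prime_utr(["Chr1", 100, "+", 50],
    #                                 ["Chr1", 150, "+", 200])
    #      == ["Chr1", 100, "+", 250]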

    def _cds_aggregation_translation(self, cds_list, feature_list, assembly,
                                     issues):

        dna_sequence = ""
        locations = list()

        # collect phases, and lengths of exons
        # right now, this is only for the purpose of error reporting
        phases = list()
        exons = list()

        #Saving parent mRNA identifier
        Parent_mRNA = cds_list[0]["id"]
        for CDS in (cds_list):
            ftr = feature_list[CDS["index"]]
            phases.append(ftr["phase"])
            Parent_mRNA = ftr["Parent"]

            contig_sequence = assembly["contigs"][ftr["contig"]]["sequence"]
            CDS_ftr = self._convert_ftr_object(
                ftr, contig_sequence
            )  #reverse-complementation for negative strands done here
            exons.append(len(CDS_ftr["dna_sequence"]))

            # Remove base(s) according to phase, but only for first CDS
            if (CDS == cds_list[0] and int(ftr["phase"]) != 0):
                log("Adjusting phase for first CDS: " + CDS["id"])
                CDS_ftr["dna_sequence"] = CDS_ftr["dna_sequence"][
                    int(ftr["phase"]):]

            #aggregate sequences and locations
            dna_sequence += CDS_ftr["dna_sequence"]
            locations.append(CDS_ftr["location"][0])

        # translate sequence
        dna_sequence_obj = Seq(dna_sequence, IUPAC.ambiguous_dna)
        rna_sequence = dna_sequence_obj.transcribe()

        # incomplete gene model with no start codon
        if str(rna_sequence.upper())[:3] not in codon_table.start_codons:
            msg = "Missing start codon for {}. Possibly incomplete gene model.".format(
                Parent_mRNA)
            log(msg)

        # This should never happen; report it rather than silently "fix" it
        remainder = len(str(rna_sequence)) % 3
        if remainder != 0:
            msg = "Number of bases for RNA sequence for {} ".format(
                Parent_mRNA)
            msg += "is not divisible by 3. "
            msg += "The resulting protein may well be mis-translated."
            log(msg)
            issues.append(Parent_mRNA)

        protein_sequence = Seq("")
        try:
            protein_sequence = rna_sequence.translate()
        except CodonTable.TranslationError as te:
            log("TranslationError for: " + Parent_mRNA + " (phases: " +
                str(phases) + ", exon lengths: " + str(exons) + "): " + str(te))

        return (locations, dna_sequence.upper(), str(protein_sequence).upper())
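
    # --- Standalone sketch of the phase handling and translation above, using
    # Biopython (assumed available). Trimming `first_phase` bases off the first
    # CDS exon restores the reading frame; translating the DNA directly is
    # equivalent to the transcribe-then-translate route taken above.
    @staticmethod
    def _demo_translate_cds(exon_seqs, first_phase):
        from Bio.Seq import Seq
        exon_seqs = list(exon_seqs)
        exon_seqs[0] = exon_seqs[0][int(first_phase):]
        protein = str(Seq("".join(exon_seqs)).translate())
        if protein.endswith("*"):  # drop the stop-codon asterisk, as above
            protein = protein[:-1]
        return protein.upper()
    # e.g. _demo_translate_cds(["ccATGGCC", "TGA"], 2) == "MA"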