Example No. 1
    def UploadFromMEME(self, ctx, params):
        """
        :param params: instance of type "UploadGibbsInParams" -> structure:
           parameter "path" of String, parameter "ws_name" of String,
           parameter "obj_name" of String
        :returns: instance of type "UploadOutput" -> structure: parameter
           "obj_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN UploadFromMEME
        print('Extracting motifs')
        motifList = MU.parse_meme_output(params['path'])
        print(motifList)

        MSO = {}
        MSO['Condition'] = 'Temp'
        MSO['SequenceSet_ref'] = '123'
        MSO['Motifs'] = []
        MSO['Alphabet'] = ['A', 'C', 'G', 'T']
        MSO['Background'] = {}
        for letter in MSO['Alphabet']:
            MSO['Background'][letter] = 0.0

        MSU.parseMotifList(motifList, MSO)
        MSU.CheckLength(MSO, params['min_len'], params['max_len'])
        if 'absolute_locations' in params:
            for motif in MSO['Motifs']:
                for loc in motif['Motif_Locations']:
                    if loc['sequence_id'] in params['absolute_locations']:
                        loc['sequence_id'] = params['contig']
                        absStart = int(params['start'])
                        loc['start'] = absStart
                        loc['end'] = absStart + loc['end']

        dfu = DataFileUtil(self.callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['ws_name'])
        save_objects_params['objects'] = [{
            'type': 'KBaseGeneRegulation.MotifSet',
            'data': MSO,
            'name': params['obj_name']
        }]

        info = dfu.save_objects(save_objects_params)[0]
        print('SAVED OBJECT')
        print(info)
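        # info is a KBase object_info tuple: info[0] is the object id, info[4] the
        # version and info[6] the workspace id, so the reference built below has the
        # form 'wsid/objid/version'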
        motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        print(motif_set_ref)
        output = {'obj_ref': motif_set_ref}
        print(output)

        #END UploadFromMEME

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method UploadFromMEME return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example No. 2
    def _save_to_ws_and_report(self, ws_id, source, assembly_data):
        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        workspace_id = dfu.ws_name_to_id(self.getWsName())
        print("Workspace id: {}".format(workspace_id))
        info = dfu.save_objects({
            'id': workspace_id,  # numerical workspace id resolved above
            'objects': [{
                'type': 'KBaseGenomeAnnotations.Assembly-3.0',
                'data': assembly_data,
                'name': ws_id
            }]
        })[0]
        #print("Data from save to ws: {}".format(json.dumps(info, indent=2)))
        assembly_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        return assembly_ref
Example No. 3
class FastaToAssembly:

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"


    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print('filtering fasta file by contig length (min len=' + str(min_contig_length) + 'bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print('parsing FASTA file: ' + str(fasta_file_path))
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(' - parsed ' + str(assembly_data['num_contigs']) + ' contigs, ' +
              str(assembly_data['dna_size']) + 'bp')

        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info


    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        ''' construct the WS object data to save based on the parsed info and params '''
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            assembly_data['taxon_ref'] = params['taxon_ref']

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return assembly_data


    def parse_fasta(self, fasta_file_path, params):
        ''' Do the actual work of inspecting each contig '''

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This fasta file may have amino acids in it instead ' +
                                         'of the required nucleotides.')
                    raise ValueError("This FASTA file has non nucleic acid characters : {0}".format(character))

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
                contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC content (rounded to 5 decimal places)
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)
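            # e.g. a 200 bp contig with 90 G + C bases gives gc_content = round(90.0 / 200.0, 5) = 0.45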

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The fasta header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')
            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
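        # the assembly-wide md5 below is the md5 of the comma-joined, sorted list of per-contig md5s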
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list))).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data


    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        ''' generator that yields only the SeqRecords meeting the min_contig_length cutoff '''
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(' - filtered out ' + str(rows - rows_added) + ' of ' + str(rows) + ' contigs that were shorter than ' +
              str(min_contig_length) + 'bp.')


    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        ''' removes all contigs less than the min_contig_length provided '''
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path


    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info


    def save_fasta_file_to_shock(self, fasta_file_path):
        ''' Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;

        '''
        print('Uploading fasta file (' + str(fasta_file_path) + ') to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})


    def stage_input(self, params):
        ''' Setup the input_directory by fetching the files and returning the path to the file'''
        file_path = None
        if 'file' in params:
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print('Downloading file from SHOCK node: ' + str(params['shock_id']))
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print('Downloading file from: ' + str(params['ftp_url']))
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid fasta could be extracted based on the input parameters')


    def validate_params(self, params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or ftp_url is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a fasta file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('exactly one fasta file input is required; set one of "file", "shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('exactly one fasta file input source is required, but you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')
Example No. 4
class GenDiffExprMatrix:

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'DEM_' + str(uuid.uuid4()))
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.fv = KBaseFeatureValues(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.setAPI = SetAPI(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self._mkdir_p(self.scratch)

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def setup_data(self):

        self.new_col_names = [
            'gene_id', 'log2_fold_change', 'p_value', 'q_value'
        ]

    def get_feature_ids(self, genome_ref):
        """
        _get_feature_ids: get feature ids from genome
        """

        feature_num = self.gsu.search({'ref': genome_ref})['num_found']

        genome_features = self.gsu.search({
            'ref': genome_ref,
            'limit': feature_num,
            'sort_by': [['feature_id', True]]
        })['features']

        features_ids = map(
            lambda genome_feature: genome_feature.get('feature_id'),
            genome_features)

        return list(set(features_ids))

    def gen_matrix(self, infile, old_col_names, delimiter):
        with open(infile, 'rb') as source:
            rdr = csv.DictReader(source, delimiter=delimiter)
            col_names = self.new_col_names[1:]
            row_names = []
            values = []
            for row in rdr:
                try:
                    values.append([float(row[v]) for v in old_col_names[1:]])
                except:
                    values_list = []
                    for v in old_col_names[1:]:
                        tmpval = row[v]
                        if isinstance(tmpval, (int, long, float)):
                            values_list.append(float(tmpval))
                        elif isinstance(tmpval, basestring):
                            if 'na' in tmpval.lower(
                            ) or 'none' in tmpval.lower():
                                values_list.append(None)
                            else:
                                tmpval = tmpval.replace("'", "")
                                tmpval = tmpval.replace('"', '')
                                values_list.append(float(tmpval))
                        else:
                            raise ValueError(
                                "invalid type in input file: {}".format(
                                    tmpval))
                    values.append(values_list)
                row_names.append(row[old_col_names[0]])

        twoD_matrix = {
            'row_ids': row_names,
            'col_ids': col_names,
            'values': values
        }

        return twoD_matrix

    def get_max_fold_change_to_handle_inf(self, infile):
        maxvalue = 0
        with open(infile) as source:
            rdr = csv.DictReader(source, dialect='excel-tab')
            for line in rdr:
                log2fc_val = line.get('log2_fold_change')
                if 'inf' not in str(log2fc_val):
                    log2fc = abs(float(log2fc_val))
                    if log2fc > maxvalue:
                        maxvalue = log2fc

            print 'maxvalue: ', maxvalue
            return maxvalue

    def gen_cuffdiff_matrix(self, infile, delimiter='\t'):
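        # cuffdiff reports 'inf'/'-inf' log2 fold changes when one condition has zero
        # expression; those are clamped below to +/- the largest finite
        # |log2_fold_change| found above, and 'nan' is stored as None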

        max_value = self.get_max_fold_change_to_handle_inf(infile)
        with open(infile, 'rb') as source:
            rdr = csv.DictReader(source, delimiter=delimiter)
            col_names = self.new_col_names[1:]

            row_names = []
            values = []
            for row in rdr:

                log2fc_val = row.get('log2_fold_change')
                # print 'FC_VAL: ', log2fc_val
                if '-inf' in str(log2fc_val):
                    row['log2_fold_change'] = -float(max_value)
                elif 'inf' in str(log2fc_val):
                    row['log2_fold_change'] = float(max_value)
                elif 'nan' in str(log2fc_val):
                    row['log2_fold_change'] = None

                try:
                    values.append(
                        [float(row[v]) for v in self.new_col_names[1:]])
                except:
                    values.append(
                        [None] +
                        [float(row[v]) for v in self.new_col_names[2:]])

                row_names.append(row[self.new_col_names[0]])

        tmatrix = {
            'row_ids': row_names,
            'col_ids': col_names,
            'values': values
        }

        return tmatrix

    def save_diff_expr_matrix(self, obj_name, data_matrix, condition1,
                              condition2):

        dem_data = {
            'genome_ref': self.params.get('genome_ref'),
            'data': data_matrix,
            'condition_mapping': {
                condition1: condition2
            },
            'type': 'log2_level',
            'scale': '1.0'
        }
        res = self.dfu.save_objects({
            'id':
            self.params.get('ws_id'),
            "objects": [{
                "type":
                "KBaseFeatureValues.DifferentialExpressionMatrix",
                "data":
                dem_data,
                "name":
                obj_name,
                "extra_provenance_input_refs": [self.params.get('genome_ref')]
            }]
        })[0]
        ret_ref = str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        return ret_ref

    def save_diff_expr_matrix_set(self, obj_name, matrix_set):

        res = self.setAPI.save_differential_expression_matrix_set_v1({
            "workspace":
            self.params.get('ws_name'),
            "output_object_name":
            obj_name,
            "data":
            matrix_set
        })
        return res.get('set_ref')

    #
    # ballgown always outputs a linear fold change, which we need to convert to log2
    # before storing
    #
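    # e.g. a linear fold change of 4.0 is stored as log2(4.0) = 2.0; rows whose fold
    # change is missing or zero are passed through unchanged by safely_apply_log2_to_fc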

    def safely_apply_log2_to_fc(self, row):
        if row[0]:
            fc = row[0]
            if fc < 1.0e-10:
                fc = fc + 1.0e-10  # in case fc is zero
            return ([log2(fc)] + row[1:])
        else:
            return (row)

    def process_ballgown_file(self, diffexpr_filepath):

        ballgown_col_names = ['id', 'fc', 'pval', 'qval']

        data_matrix = self.gen_matrix(diffexpr_filepath,
                                      ballgown_col_names,
                                      delimiter='\t')
        log2_data_matrix = data_matrix
        log2_data_matrix['values'] = map(self.safely_apply_log2_to_fc,
                                         data_matrix.get('values'))

        dem_ref = self.save_diff_expr_matrix(
            self.params.get('obj_name') + '_0', log2_data_matrix, None, None)
        set_items = [{
            'label': 'global Differential Expression Data',
            'ref': dem_ref
        }]
        matrix_set = {
            'description': 'ballgown Diff Exp Matrix Set',
            'items': set_items
        }
        return self.save_diff_expr_matrix_set(self.params.get('obj_name'),
                                              matrix_set)

    def process_deseq_file(self, diffexpr_filepath):

        deseq_col_names = ['geneID', 'log2FoldChange', 'pvalue', 'padj']

        data_matrix = self.gen_matrix(diffexpr_filepath,
                                      deseq_col_names,
                                      delimiter=',')

        dem_ref = self.save_diff_expr_matrix(
            self.params.get('obj_name') + '_0', data_matrix, None, None)
        set_items = [{
            'label': 'global Differential Expression Data',
            'ref': dem_ref
        }]
        matrix_set = {
            'description': 'deseq Diff Exp Matrix Set',
            'items': set_items
        }
        return self.save_diff_expr_matrix_set(self.params.get('obj_name'),
                                              matrix_set)

    def process_cuffdiff_file(self, diffexpr_filepath):

        cuffdiff_col_names = [
            'gene', 'log2(fold_change)', 'p_value', 'q_value'
        ]

        ConditionPair = namedtuple("ConditionPair",
                                   ["condition1", "condition2"])
        FileInfo = namedtuple('FileInfo', ['file_path', 'file_obj'])

        condPair_fileInfo = {}

        timestamp = str(
            int((datetime.utcnow() -
                 datetime.utcfromtimestamp(0)).total_seconds() * 1000))
        with open(diffexpr_filepath, 'rb') as source:
            rdr = csv.DictReader(source, dialect='excel-tab')
            """
            save the files opened for writing in outfiles list, so they can be closed later
            """
            outfiles = list()
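            # split the cuffdiff rows into one TSV per (sample_1, sample_2) condition
            # pair; each TSV is later turned into its own DifferentialExpressionMatrix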

            for r in rdr:
                c1 = r['sample_1']
                c2 = r['sample_2']

                cond_pair = ConditionPair(condition1=c1, condition2=c2)
                tsv_file_info = condPair_fileInfo.get(cond_pair, None)
                if tsv_file_info is None:
                    tsv_file_name = timestamp + '_' + c1 + '~~' + c2
                    tsv_file_path = os.path.join(self.scratch, tsv_file_name)
                    outfile = open(tsv_file_path, 'wb')
                    outfiles.append(outfile)
                    csv_wtr = csv.DictWriter(outfile,
                                             delimiter='\t',
                                             fieldnames=self.new_col_names)
                    csv_wtr.writerow(
                        dict((cn, cn) for cn in self.new_col_names))
                    tsv_file_info = FileInfo(file_path=tsv_file_path,
                                             file_obj=csv_wtr)
                    condPair_fileInfo[cond_pair] = tsv_file_info

                wtr = tsv_file_info.file_obj
                col_vals = [r[v] for v in cuffdiff_col_names]
                wtr.writerow(dict(zip(self.new_col_names, col_vals)))

            for ofile in outfiles:
                ofile.close()

            set_items = list()
            for cond_pair, file_info in condPair_fileInfo.iteritems():
                print 'Cond_pair: ', cond_pair
                print 'File: ', file_info.file_path
                tsv_file = file_info.file_path

                data_matrix = self.gen_cuffdiff_matrix(tsv_file)

                object_name = self.get_obj_name(self.params['obj_name'],
                                                cond_pair.condition1,
                                                cond_pair.condition2)
                dem_ref = self.save_diff_expr_matrix(object_name, data_matrix,
                                                     cond_pair.condition1,
                                                     cond_pair.condition2)
                print('process_cuffdiff_file: DEM_REF: ' + dem_ref)
                set_items.append({
                    'label':
                    cond_pair.condition1 + ', ' + cond_pair.condition2,
                    'ref':
                    dem_ref
                })

        matrix_set = {
            'description': 'cuffdiff Diff Exp Matrix Set',
            'items': set_items
        }
        return self.save_diff_expr_matrix_set(self.params.get('obj_name'),
                                              matrix_set)

    """
    Functions for save_differentialExpressionMatrixSet
    """

    def save_matrix(self, genome_ref, infile, in_col_names, delimiter):

        feature_ids = self.get_feature_ids(genome_ref)

        with open(infile, 'rb') as source:
            rdr = csv.DictReader(source, delimiter=delimiter)
            col_names = in_col_names[1:]
            row_names = []
            values = []
            for row in rdr:
                if row[in_col_names[0]] in feature_ids:
                    row_names.append(row[in_col_names[0]])
                else:
                    gene_ids = row[in_col_names[0]].strip().split(',')
                    match = True
                    mismatched_gene_ids = list()
                    for gene_id in gene_ids:
                        gene_id = gene_id.strip()
                        if gene_id not in feature_ids:
                            mismatched_gene_ids.append(gene_id)
                            match = False
                    if match:
                        row_names.append(row[in_col_names[0]])
                    else:
                        error_msg = 'Gene_id(s) "{}" is not a known feature in "{}"'.format(
                            ', '.join(mismatched_gene_ids),
                            self.params.get('genome_ref'))
                        raise ValueError(error_msg)
                try:
                    values.append([float(row[v]) for v in in_col_names[1:]])
                except:
                    values_list = []
                    for v in in_col_names[1:]:
                        tmpval = row[v]
                        if isinstance(tmpval, (int, long, float)):
                            values_list.append(float(tmpval))
                        elif isinstance(tmpval, basestring):
                            if 'na' in tmpval.lower(
                            ) or 'none' in tmpval.lower():
                                values_list.append(None)
                            else:
                                tmpval = tmpval.replace("'", "")
                                tmpval = tmpval.replace('"', '')
                                values_list.append(float(tmpval))
                        else:
                            raise ValueError(
                                "invalid type in input file: {}".format(
                                    tmpval))
                    values.append(values_list)

        twoD_matrix = {
            'row_ids': row_names,
            'col_ids': col_names,
            'values': values
        }

        return twoD_matrix

    @staticmethod
    def get_obj_name(obj_name, condition1, condition2):
        def sanitize(ws_name):
            # I'm not using translate because it's a mess with mixed unicode & strings
            return ws_name.replace("\t", " ").replace(" ",
                                                      "_").replace("/", "|")

        return "{}-{}-{}".format(obj_name, sanitize(condition1),
                                 sanitize(condition2))

    def gen_diffexpr_matrices(self, params):

        print('In GEN DEMs')
        self.params = params
        self.setup_data()
        diffexpr_filepath = self.params.get('diffexpr_filepath')

        if 'deseq' in self.params.get('tool_used').lower():
            dem_ref = self.process_deseq_file(diffexpr_filepath)
        elif 'ballgown' in self.params.get('tool_used').lower():
            dem_ref = self.process_ballgown_file(diffexpr_filepath)
        elif 'cuffdiff' in self.params.get('tool_used').lower():
            dem_ref = self.process_cuffdiff_file(diffexpr_filepath)
        else:
            raise ValueError('"{}" is not a valid tool_used parameter'.format(
                self.params.get('tool_used')))
        return dem_ref

    def save_diffexpr_matrices(self, params):

        print('In SAVE DEMs')
        self.params = params
        self.setup_data()

        set_items = list()
        for deFile in self.params.get('diffexpr_data'):
            condition_mapping = deFile.get('condition_mapping')
            diffexpr_filepath = deFile.get('diffexpr_filepath')

            if deFile.get('delimter', None) is not None:
                delimiter = deFile.get('delimter')
            else:
                delimiter = '\t'
                fileext = os.path.splitext(diffexpr_filepath)[1]

                if 'csv' in fileext.lower():
                    delimiter = ','
                elif 'tsv' in fileext.lower():
                    delimiter = '\t'
                else:
                    print('Using tab delimiter')

            data_matrix = self.save_matrix(self.params.get('genome_ref'),
                                           diffexpr_filepath,
                                           self.new_col_names, delimiter)

            condition1, condition2 = condition_mapping.items()[0]
            object_name = self.get_obj_name(self.params['obj_name'],
                                            condition1, condition2)
            dem_ref = self.save_diff_expr_matrix(object_name, data_matrix,
                                                 condition1, condition2)
            set_items.append({
                'label': condition1 + ', ' + condition2,
                'ref': dem_ref
            })

        matrix_set = {
            'description': self.params.get('tool_used') +
            ' Differential Expression Matrix Set',
            'items': set_items
        }
        return self.save_diff_expr_matrix_set(self.params.get('obj_name'),
                                              matrix_set)
Example No. 5
class ExprMatrixUtils:
    """
     Constains a set of functions for expression levels calculations.
    """

    PARAM_IN_WS_NAME = 'workspace_name'
    PARAM_IN_OBJ_NAME = 'output_obj_name'
    PARAM_IN_EXPSET_REF = 'expressionset_ref'

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'], 'EM_' + str(uuid.uuid4()))
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        pass

    def process_params(self, params):
        """
        validates params passed to gen expression matrix method
        """
        for p in [self.PARAM_IN_EXPSET_REF,
                  self.PARAM_IN_OBJ_NAME,
                  self.PARAM_IN_WS_NAME
                 ]:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        ws_name_id = params.get(self.PARAM_IN_WS_NAME)
        if not isinstance(ws_name_id, int):
            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)
        self.ws_id = ws_name_id

    def get_expressionset_data(self, expressionset_ref):

        expr_set_obj = self.ws_client.get_objects2(
            {'objects': [{'ref': expressionset_ref}]})['data'][0]

        expr_set_obj_type = expr_set_obj.get('info')[2]
        expr_set_data = dict()
        expr_set_data['ws_name'] = expr_set_obj.get('info')[7]
        expr_set_data['obj_name'] = expr_set_obj.get('info')[1]

        if re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d', expr_set_obj_type):
            expr_set_data['genome_ref'] = expr_set_obj['data']['genome_id']
            expr_obj_refs = list()
            for expr_obj in expr_set_obj['data']['mapped_expression_ids']:
                expr_obj_refs.append(expr_obj.values()[0])
            expr_set_data['expr_obj_refs'] = expr_obj_refs

        elif re.match('KBaseSets.ExpressionSet-\d.\d', expr_set_obj_type):
            items = expr_set_obj.get('data').get('items')
            expr_obj_refs = list()
            for item in items:
                expr_obj_refs.append(item['ref'])
            expr_obj = self.ws_client.get_objects2(
                {'objects': [{'ref': expr_obj_refs[0]}]})['data'][0]
            expr_set_data['genome_ref'] = expr_obj['data']['genome_id']
            expr_set_data['expr_obj_refs'] = expr_obj_refs
        else:
            raise TypeError(self.PARAM_IN_EXPSET_REF + ' should be of type ' +
                            'KBaseRNASeq.RNASeqExpressionSet ' +
                            'or KBaseSets.ExpressionSet')
        return expr_set_data

    def save_expression_matrix(self, tables, expr_set_data, em_obj_name, hidden = 0):

        all_rows = {}    # build a dictionary of keys only which is a union of all row ids (gene_ids)
        self.logger.info('***** length of tables is {0}'.format(len(tables)))
        for table in tables:
            for r in table.keys():
                all_rows[r] = []

        for gene_id in all_rows.keys():
            row = []
            for table in tables:
                if gene_id in table:
                    row.append(table[gene_id])
                else:
                    row.append(0)
            all_rows[gene_id] = row
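        # at this point each row holds one value per input expression table, with 0
        # filled in where a gene was absent from a table; these become the
        # ExpressionMatrix rows below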

        em_data = {
                    'genome_ref': expr_set_data['genome_ref'],
                    'scale': 'log2',
                    'type': 'level',
                    'data': {
                            'row_ids': [],
                            'values': [],
                            'col_ids': expr_set_data['expr_obj_names']
                            },
                    'feature_mapping' : {},
                    'condition_mapping': expr_set_data['condition_map']
                   }

        # we need to load row-by-row to preserve the order
        self.logger.info('loading expression matrix data')

        for gene_id in all_rows.keys():
            em_data['feature_mapping'][gene_id] = gene_id
            em_data['data']['row_ids'].append(gene_id)
            em_data['data']['values'].append(all_rows[gene_id])

        try:
            self.logger.info( 'saving em_data em_name {0}'.format(em_obj_name))
            obj_info = self.dfu.save_objects({'id': self.ws_id,
                                              'objects': [
                                                          { 'type': 'KBaseFeatureValues.ExpressionMatrix',
                                                            'data': em_data,
                                                            'name': em_obj_name,
                                                            'hidden': hidden,
                                                            'extra_provenance_input_refs': [
                                                                em_data.get('genome_ref'),
                                                                self.params[self.PARAM_IN_EXPSET_REF]]
                                                          }
                                                    ]})[0]
            self.logger.info('ws save return:\n' + pformat(obj_info))
        except Exception as e:
            self.logger.exception(e)
            raise Exception('Failed Saving Expression Matrix to Workspace')

        return str(obj_info[6]) + '/' + str(obj_info[0]) + '/' + str(obj_info[4])

    def get_expression_matrix(self, params):

        self.process_params(params)
        self.params = params

        expressionset_ref = params.get(self.PARAM_IN_EXPSET_REF)

        expr_set_data = self.get_expressionset_data(expressionset_ref)
        expr_obj_names = list()
        fpkm_tables = list()
        tpm_tables = list()
        condition_map = dict()
        tpm_table = None
        for expr_obj_ref in expr_set_data['expr_obj_refs']:
            try:
                self.logger.info('*** getting expression set {0} from workspace ****'
                                 .format(expr_obj_ref))

                expr = self.ws_client.get_objects2(
                                            {'objects':
                                            [{'ref': expr_obj_ref}]})['data'][0]

            except Exception, e:
                self.logger.exception(e)
                raise Exception('Unable to download expression object {0} from workspace {1}'.
                                format(expr_obj_ref, expr_set_data['ws_name']))

            expr_name = expr.get('info')[1]
            expr_obj_names.append(expr_name)
            condition_map.update({expr_name: expr.get('data').get('condition')})
            num_interp = expr.get('data').get('numerical_interpretation')
            if num_interp != 'FPKM':
                raise Exception(
                    'Did not get expected FPKM value from numerical interpretation key from '
                    'Expression object {0}, instead got {1}'.format(expr_obj_ref, num_interp))

            pr_comments = expr.get('data').get('processing_comments', None)  # log2 Normalized
            if pr_comments is not None:
                self.logger.info('pr_comments are {0}'.format(pr_comments))

            fpkm_table = expr.get('data').get('expression_levels') # QUESTION: is this really FPKM levels?
            self.logger.info('FPKM keycount: {0}'.format(len(fpkm_table.keys())))
            fpkm_tables.append(fpkm_table)

            tpm_table = None  # Cufflinks doesn't generate TPM
            if 'tpm_expression_levels' in expr['data']:  # so we need to check for this key
                tpm_table = expr.get('data').get('tpm_expression_levels')
                self.logger.info('TPM keycount: {0}'.format(len(tpm_table.keys())))
                tpm_tables.append(tpm_table)

        expr_set_data['expr_obj_names'] = expr_obj_names
        expr_set_data['condition_map'] = condition_map
        output_obj_name = params.get(self.PARAM_IN_OBJ_NAME)
        fpkm_ref = self.save_expression_matrix(fpkm_tables,
                                               expr_set_data,
                                               '{0}_FPKM_ExpressionMatrix'.format(output_obj_name))
        tpm_ref = None
        if tpm_table is not None:
            tpm_ref = self.save_expression_matrix(tpm_tables,
                                                  expr_set_data,
                                                  '{0}_TPM_ExpressionMatrix'.format(output_obj_name))
        return fpkm_ref, tpm_ref
class ExpressionUtils:
    '''
    Module Name:
    ExpressionUtils

    Module Description:
    A KBase module: ExpressionUtils

This module is intended for use by Assemblers to upload RNASeq Expression files
(gtf, fpkm and ctab). This module generates the ctab files and tpm data if they are absent.
The expression files are uploaded as a single compressed file. This module also generates
expression levels and tpm expression levels from the input files and saves them in the
workspace object. Once uploaded, the expression files can be downloaded into an output directory.
    '''
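    # A minimal, hypothetical usage sketch (client-side code, not part of this
    # module; 'config' and 'ctx' come from the SDK runtime, and the workspace/object
    # names and directory below are made up):
    #
    #   eu = ExpressionUtils(config)
    #   ret = eu.upload_expression(ctx, {
    #       'destination_ref': 'my_workspace/my_expression',  # ws_name_or_id/obj_name_or_id
    #       'source_dir': '/kb/module/work/tmp/expression_out',
    #       'alignment_ref': 'my_workspace/my_alignment',
    #   })[0]
    #   print(ret['obj_ref'])  # 'wsid/objid/version' of the saved RNASeqExpression object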

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.1.1"
    GIT_URL = "https://github.com/JamesJeffryes/ExpressionUtils.git"
    GIT_COMMIT_HASH = "62ce653aa5c5b39a597486613bc140b173a35c99"

    #BEGIN_CLASS_HEADER

    PARAM_IN_SRC_DIR = 'source_dir'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_ALIGNMENT_REF = 'alignment_ref'

    PARAM_IN_GENOME_REF = 'genome_ref'
    PARAM_IN_ANNOTATION_ID = 'annotation_id'
    PARAM_IN_BAM_FILE_PATH = 'bam_file_path'
    PARAM_IN_DESCRIPTION = 'description'
    PARAM_IN_DATA_QUAL_LEVEL = 'data_quality_level'
    PARAM_IN_PROC_COMMENTS = 'processing_comments'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'
    PARAM_IN_ORIG_MEDIAN = 'original_median'
    PARAM_IN_EXT_SRC_DATE = 'external_source_date'
    PARAM_IN_TRANSCRIPTS = 'transcripts'
    PARAM_IN_SRC = 'source'

    def _check_required_param(self, in_params, param_list):
        """
        Check if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Check the validity of workspace and object params and return them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)
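        # destination_ref has the form 'ws_name_or_id/obj_name_or_id', so a path-style
        # split separates the workspace part from the object name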

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        dfu = DataFileUtil(self.callback_url)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _proc_upload_expression_params(self, ctx, params):
        """
        Check the presence and validity of upload expression params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_SRC_DIR,
            self.PARAM_IN_ALIGNMENT_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        source_dir = params.get(self.PARAM_IN_SRC_DIR)

        if not (os.path.isdir(source_dir)):
            raise ValueError('Source directory does not exist: ' + source_dir)

        if not os.listdir(source_dir):
            raise ValueError('Source directory is empty: ' + source_dir)

        return ws_name_id, obj_name_id, source_dir

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _get_genome_ref(self, assembly_or_genome_ref, params):
        if self.PARAM_IN_GENOME_REF in params and params[
                self.PARAM_IN_GENOME_REF] is not None:
            return params[self.PARAM_IN_GENOME_REF]

        obj_type = self._get_ws_info(assembly_or_genome_ref)[2]
        if obj_type.startswith('KBaseGenomes.Genome'):
            return assembly_or_genome_ref

        raise ValueError('Alignment object does not contain genome_ref; '
                         '"{}" parameter is required'.format(
                             self.PARAM_IN_GENOME_REF))

    def _get_expression_levels(self,
                               source_dir,
                               genome_ref,
                               transcripts=False):

        fpkm_file_path = os.path.join(source_dir, 'genes.fpkm_tracking')
        if transcripts:
            fpkm_file_path = os.path.join(source_dir, 't_data.ctab')

        if not os.path.isfile(fpkm_file_path):
            raise ValueError('{} file is required'.format(fpkm_file_path))

        id_col = 5 if transcripts else 0
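        # id column: 0 (the gene/tracking id) in genes.fpkm_tracking, 5 (t_name) in
        # t_data.ctab -- column positions assumed from the standard cufflinks and
        # ballgown/tablemaker file layouts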
        self.__LOGGER.info(
            'Generating expression levels from {}'.format(fpkm_file_path))
        return self.expression_utils.get_expression_levels(
            fpkm_file_path, genome_ref, id_col)

    def _gen_ctab_files(self, params, alignment_ref):

        source_dir = params.get(self.PARAM_IN_SRC_DIR)
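        # a complete tablemaker/ballgown output has five ctab files (e2t.ctab,
        # e_data.ctab, i2t.ctab, i_data.ctab, t_data.ctab); regenerate them if any
        # are missing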
        if len(glob.glob(source_dir + '/*.ctab')) < 5:

            self.__LOGGER.info(' =======  Generating ctab files ==========')
            gtf_file = os.path.join(source_dir, 'transcripts.gtf')
            if not os.path.isfile(gtf_file):
                raise ValueError(
                    "{} file is required to generate ctab files but was not found"
                    .format(gtf_file))

            if self.PARAM_IN_BAM_FILE_PATH in params and \
               params[self.PARAM_IN_BAM_FILE_PATH] is not None:
                bam_file_path = params[self.PARAM_IN_BAM_FILE_PATH]
            else:
                self.__LOGGER.info(
                    'Downloading bam file from alignment object')
                rau = ReadsAlignmentUtils(self.callback_url)
                alignment_retVal = rau.download_alignment(
                    {'source_ref': alignment_ref})
                alignment_dir = alignment_retVal.get('destination_dir')

                allbamfiles = glob.glob(alignment_dir + '/*.bam')
                if len(allbamfiles) == 0:
                    raise ValueError('bam file does not exist in {}'.format(alignment_dir))
                elif len(allbamfiles) == 1:
                    bam_file_path = allbamfiles[0]
                elif len(allbamfiles) > 1:
                    tmp_file_path = os.path.join(alignment_dir,
                                                 'accepted_hits.bam')
                    if os.path.isfile(tmp_file_path):
                        bam_file_path = tmp_file_path
                    else:
                        tmp_file_path = os.path.join(
                            alignment_dir, 'accepted_hits_sorted.bam')
                        if os.path.isfile(tmp_file_path):
                            bam_file_path = tmp_file_path
                        else:
                            raise ValueError(
                                'accepted_hits.bam, accepted_hits_sorted.bam or other bam file not found in {}'
                                .format(alignment_dir))

            result = self.table_maker.build_ctab_files(
                ref_genome_path=gtf_file,
                alignment_path=bam_file_path,
                output_dir=source_dir)
            if result != 0:
                raise ValueError('Tablemaker failed')

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.__LOGGER = logging.getLogger('ExpressionUtils')
        self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s"
        )
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.config['SDK_CALLBACK_URL'] = self.callback_url
        self.expression_utils = Expression_Utils(self.config)
        self.dfu = DataFileUtil(self.callback_url)
        self.table_maker = TableMaker(config, self.__LOGGER)
        self.expr_matrix_utils = ExprMatrixUtils(config, self.__LOGGER)
        #END_CONSTRUCTOR
        pass

    def upload_expression(self, ctx, params):
        """
        Uploads the expression  *
        :param params: instance of type "UploadExpressionParams" (*   
           Required input parameters for uploading a reads expression data
           string   destination_ref        -   object reference of expression
           data. The object ref is 'ws_name_or_id/obj_name_or_id' where
           ws_name_or_id is the workspace name or id and obj_name_or_id is
           the object name or id string   source_dir             -  
           directory with the files to be uploaded string   alignment_ref    
           -   alignment workspace object reference *) -> structure:
           parameter "destination_ref" of String, parameter "source_dir" of
           String, parameter "alignment_ref" of String, parameter
           "genome_ref" of String, parameter "annotation_id" of String,
           parameter "bam_file_path" of String, parameter "transcripts" of
           type "boolean" (A boolean - 0 for false, 1 for true. @range (0,
           1)), parameter "data_quality_level" of Long, parameter
           "original_median" of Double, parameter "description" of String,
           parameter "platform" of String, parameter "source" of String,
           parameter "external_source_date" of String, parameter
           "processing_comments" of String
        :returns: instance of type "UploadExpressionOutput" (*     Output
           from upload expression    *) -> structure: parameter "obj_ref" of
           String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_expression

        self.__LOGGER.info('Starting upload expression, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, source_dir = self._proc_upload_expression_params(
            ctx, params)

        alignment_ref = params.get(self.PARAM_IN_ALIGNMENT_REF)
        try:
            alignment_obj = self.dfu.get_objects(
                {'object_refs': [alignment_ref]})['data'][0]
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        alignment = alignment_obj['data']
        assembly_or_genome_ref = alignment['genome_id']

        genome_ref = self._get_genome_ref(assembly_or_genome_ref, params)

        expression_levels, tpm_expression_levels = self._get_expression_levels(
            source_dir, genome_ref, params.get(self.PARAM_IN_TRANSCRIPTS))

        self._gen_ctab_files(params, alignment_ref)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': source_dir,
            'make_handle': 1,
            'pack': 'zip'
        })
        """
        move the zipfile created in the source directory one level up
        """
        path, dir = os.path.split(source_dir)
        zipfile = dir + '.zip'
        if os.path.isfile(os.path.join(source_dir, zipfile)):
            shutil.move(os.path.join(source_dir, zipfile),
                        os.path.join(path, zipfile))

        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        expression_data = {
            'numerical_interpretation': 'FPKM',
            'genome_id': genome_ref,
            'mapped_rnaseq_alignment': {
                alignment['read_sample_id']: alignment_ref
            },
            'condition': alignment['condition'],
            'file': file_handle,
            'expression_levels': expression_levels,
            'tpm_expression_levels': tpm_expression_levels
        }
        additional_params = [
            self.PARAM_IN_ANNOTATION_ID, self.PARAM_IN_DESCRIPTION,
            self.PARAM_IN_DATA_QUAL_LEVEL, self.PARAM_IN_PLATFORM,
            self.PARAM_IN_PROC_COMMENTS, self.PARAM_IN_MAPPED_SAMPLE_ID,
            self.PARAM_IN_ORIG_MEDIAN, self.PARAM_IN_EXT_SRC_DATE,
            self.PARAM_IN_SRC
        ]

        for opt_param in additional_params:
            if opt_param in params and params[opt_param] is not None:
                expression_data[opt_param] = params[opt_param]

        extra_provenance_input_refs = list()
        extra_provenance_input_refs.append(
            params.get(self.PARAM_IN_ALIGNMENT_REF))
        if self.PARAM_IN_GENOME_REF in params and params.get(
                self.PARAM_IN_GENOME_REF) is not None:
            extra_provenance_input_refs.append(
                params.get(self.PARAM_IN_GENOME_REF))

        self.__LOGGER.info('===========   Adding extra_provenance_refs')
        self.__LOGGER.info(str(extra_provenance_input_refs))
        self.__LOGGER.info('==========================================')

        res = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type":
                "KBaseRNASeq.RNASeqExpression",
                "data":
                expression_data,
                "name":
                obj_name_id,
                "extra_provenance_input_refs":
                extra_provenance_input_refs
            }]
        })[0]

        self.__LOGGER.info('save complete')

        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        print(returnVal)
        #END upload_expression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_expression return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_expression(self, ctx, params):
        """
        Downloads expression *
        :param params: instance of type "DownloadExpressionParams" (*
           Required input parameters for downloading expression string
           source_ref         -       object reference of expression source.
           The object ref is 'ws_name_or_id/obj_name_or_id' where
           ws_name_or_id is the workspace name or id and obj_name_or_id is
           the object name or id *) -> structure: parameter "source_ref" of
           String
        :returns: instance of type "DownloadExpressionOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_expression

        self.__LOGGER.info('Running download_expression with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required')

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'download_' + str(timestamp))
        os.mkdir(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id': expression[0]['data']['file']['id'],
            'file_path': output_dir,
            'unpack': 'unpack'
        })

        if not os.listdir(output_dir):
            raise ValueError('No files were downloaded: ' + output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        returnVal = {'destination_dir': output_dir}

        #END download_expression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_expression return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
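
    # Usage sketch (illustrative only): `impl`, `ctx` and the object reference
    # below are placeholders, not values from the original code.
    #
    #     result = impl.download_expression(ctx, {'source_ref': '12345/6/7'})[0]
    #     # result['destination_dir'] is a scratch subdirectory holding the
    #     # unpacked expression file(s); downloaded .zip archives are removed.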

    def export_expression(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download expressions from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting expression string   source_ref         - 
           object reference of expression source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_expression

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required')

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        output = {'shock_id': expression[0]['data']['file']['id']}

        #END export_expression

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_expression return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
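
    # Usage sketch (illustrative only): the object reference below is a
    # placeholder.
    #
    #     output = impl.export_expression(ctx, {'source_ref': '12345/6/7'})[0]
    #     # output['shock_id'] is the Shock node that stores the expression file.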

    def get_expressionMatrix(self, ctx, params):
        """
        :param params: instance of type "getExprMatrixParams" (* Following
           are the required input parameters to get Expression Matrix *) ->
           structure: parameter "workspace_name" of String, parameter
           "output_obj_name" of String, parameter "expressionset_ref" of
           String
        :returns: instance of type "getExprMatrixOutput" -> structure:
           parameter "exprMatrix_FPKM_ref" of String, parameter
           "exprMatrix_TPM_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_expressionMatrix
        fpkm_ref, tpm_ref = self.expr_matrix_utils.get_expression_matrix(
            params)

        returnVal = {
            'exprMatrix_FPKM_ref': fpkm_ref,
            'exprMatrix_TPM_ref': tpm_ref
        }
        #END get_expressionMatrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method get_expressionMatrix return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example No. 7
0
    def upload_reads(self, ctx, params):
        """
        Loads a set of reads to KBase data stores.
        :param params: instance of type "UploadReadsParams" (Input to the
           upload_reads function. Required parameters: fwd_id - the id of the
           shock node containing the reads data file: either single end
           reads, forward/left reads, or interleaved reads. sequencing_tech -
           the sequencing technology used to produce the reads. One of: wsid
           - the id of the workspace where the reads will be saved
           (preferred). wsname - the name of the workspace where the reads
           will be saved. One of: objid - the id of the workspace object to
           save over name - the name to which the workspace object will be
           saved Optional parameters: rev_id - the shock node id containing
           the reverse/right reads for paired end, non-interleaved reads.
           single_genome - whether the reads are from a single genome or a
           metagenome. Default is single genome. strain - information about
           the organism strain that was sequenced. source - information about
           the organism source. interleaved - specify that the fwd reads file
           is an interleaved paired end reads file as opposed to a single end
           reads file. Default true, ignored if rev_id is specified.
           read_orientation_outward - whether the read orientation is outward
           from the set of primers. Default is false and is ignored for
           single end reads. insert_size_mean - the mean size of the genetic
           fragments. Ignored for single end reads. insert_size_std_dev - the
           standard deviation of the size of the genetic fragments. Ignored
           for single end reads.) -> structure: parameter "fwd_id" of String,
           parameter "wsid" of Long, parameter "wsname" of String, parameter
           "objid" of Long, parameter "name" of String, parameter "rev_id" of
           String, parameter "sequencing_tech" of String, parameter
           "single_genome" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "strain" of type "StrainInfo"
           (Information about a strain. genetic_code - the genetic code of
           the strain. See
           http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
           genus - the genus of the strain species - the species of the
           strain strain - the identifier for the strain source - information
           about the source of the strain organelle - the organelle of
           interest for the related data (e.g. mitochondria) ncbi_taxid - the
           NCBI taxonomy ID of the strain location - the location from which
           the strain was collected @optional genetic_code source ncbi_taxid
           organelle location) -> structure: parameter "genetic_code" of
           Long, parameter "genus" of String, parameter "species" of String,
           parameter "strain" of String, parameter "organelle" of String,
           parameter "source" of type "SourceInfo" (Information about the
           source of a piece of data. source - the name of the source (e.g.
           NCBI, JGI, Swiss-Prot) source_id - the ID of the data at the
           source project_id - the ID of a project encompassing the data at
           the source @optional source source_id project_id) -> structure:
           parameter "source" of String, parameter "source_id" of type
           "source_id" (An ID used for a piece of data at its source. @id
           external), parameter "project_id" of type "project_id" (An ID used
           for a project encompassing a piece of data at its source. @id
           external), parameter "ncbi_taxid" of Long, parameter "location" of
           type "Location" (Information about a location. lat - latitude of
           the site, recorded as a decimal number. North latitudes are
           positive values and south latitudes are negative numbers. lon -
           longitude of the site, recorded as a decimal number. West
           longitudes are positive values and east longitudes are negative
           numbers. elevation - elevation of the site, expressed in meters
           above sea level. Negative values are allowed. date - date of an
           event at this location (for example, sample collection), expressed
           in the format YYYY-MM-DDThh:mm:ss.SSSZ description - a free text
           description of the location and, if applicable, the associated
           event. @optional date description) -> structure: parameter "lat"
           of Double, parameter "lon" of Double, parameter "elevation" of
           Double, parameter "date" of String, parameter "description" of
           String, parameter "source" of type "SourceInfo" (Information about
           the source of a piece of data. source - the name of the source
           (e.g. NCBI, JGI, Swiss-Prot) source_id - the ID of the data at the
           source project_id - the ID of a project encompassing the data at
           the source @optional source source_id project_id) -> structure:
           parameter "source" of String, parameter "source_id" of type
           "source_id" (An ID used for a piece of data at its source. @id
           external), parameter "project_id" of type "project_id" (An ID used
           for a project encompassing a piece of data at its source. @id
           external), parameter "interleaved" of type "boolean" (A boolean -
           0 for false, 1 for true. @range (0, 1)), parameter
           "read_orientation_outward" of type "boolean" (A boolean - 0 for
           false, 1 for true. @range (0, 1)), parameter "insert_size_mean" of
           Double, parameter "insert_size_std_dev" of Double
        :returns: instance of type "UploadReadsOutput" (The output of the
           upload_reads function. obj_ref - a reference to the new Workspace
           object in the form X/Y/Z, where X is the workspace ID, Y is the
           object ID, and Z is the version.) -> structure: parameter
           "obj_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_reads
        self.log('Starting upload reads, parsing args')
        o, wsid, name, objid, kbtype, single_end, fwdid, revid = (
            self._proc_upload_reads_params(ctx, params))
        interleaved = 1 if (not single_end and not revid) else 0
        fileinput = [{
            'shock_id': fwdid,
            'file_path': self.scratch + '/fwd/',
            'unpack': 'uncompress'
        }]
        if revid:
            fileinput.append({
                'shock_id': revid,
                'file_path': self.scratch + '/rev/',
                'unpack': 'uncompress'
            })
        dfu = DataFileUtil(self.callback_url, token=ctx['token'])
        self.log('downloading reads files from Shock')
        files = dfu.shock_to_file_mass(fileinput)
        self.log('download complete, validating files')
        for f, i in zip(files, fileinput):
            if not self.validateFASTQ(ctx, [{
                    'file_path': f['file_path'],
                    'interleaved': interleaved
            }])[0][0]['validated']:
                raise ValueError(
                    'Invalid FASTQ file {} from Shock node {}'.format(
                        f['file_path'], i['shock_id']))
        self.log('file validation complete')
        self.log('coercing forward reads node to my control, muhahahaha!')
        fwdr = dfu.own_shock_node({'shock_id': fwdid, 'make_handle': 1})
        self.log('coercing complete, my evil schemes know no bounds')
        revr = None
        if revid:
            self.log('coercing reverse reads node to my control, muhahahaha!')
            revr = dfu.own_shock_node({'shock_id': revid, 'make_handle': 1})
            self.log('coercing complete. Will I stop at nothing?')

        # TODO calculate gc content, read size, read_count (find a program)
        fwdfile = {
            'file': fwdr['handle'],
            'encoding': 'ascii',
            'size': files[0]['size'],
            'type': 'fq'
        }
        if single_end:
            o['lib'] = fwdfile
        else:
            o['lib1'] = fwdfile
            if revr:
                o['lib2'] = {
                    'file': revr['handle'],
                    'encoding': 'ascii',
                    'size': files[1]['size'],
                    'type': 'fq'
                }

        so = {'type': kbtype, 'data': o}
        if name:
            so['name'] = name
        else:
            so['objid'] = objid
        self.log('saving workspace object')
        oi = dfu.save_objects({'id': wsid, 'objects': [so]})[0]
        self.log('save complete')

        returnVal = {
            'obj_ref': str(oi[6]) + '/' + str(oi[0]) + '/' + str(oi[4])
        }
        #END upload_reads

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_reads return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
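
    # Illustrative parameter sketch, not from the original code: a minimal
    # paired-end call to upload_reads. The Shock node ids, workspace name and
    # values are placeholders; parameter names follow the docstring above.
    #
    #     params = {
    #         'fwd_id': '<forward-reads-shock-id>',
    #         'rev_id': '<reverse-reads-shock-id>',
    #         'sequencing_tech': 'Illumina',
    #         'wsname': 'my_workspace',
    #         'name': 'my_paired_end_reads',
    #         'insert_size_mean': 300.0,
    #         'insert_size_std_dev': 30.0
    #     }
    #     returnVal = impl.upload_reads(ctx, params)[0]  # {'obj_ref': 'X/Y/Z'}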
Example No. 8
0
class TreeUtils:
    '''
    Module Name:
    TreeUtils

    Module Description:
    
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = ""
    GIT_COMMIT_HASH = "acb216cd302c161d5b4dfb272bd4bbae44cdac28"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.utils = Utils(config)
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config['workspace-url'])
        logging.basicConfig(level=logging.INFO)
        #END_CONSTRUCTOR
        pass


    def get_trees(self, ctx, params):
        """
        :param params: instance of type "GetTreesParams" (tree_refs -
           (required) list of WS references included_fields - (optional)
           subset of tree fields to include) -> structure: parameter
           "tree_refs" of list of String, parameter "included_fields" of list
           of String
        :returns: instance of list of type "TreeData" -> structure: parameter
           "data" of type "Tree" (Data type for phylogenetic trees. @optional
           name description type tree_attributes @optional
           default_node_labels ws_refs kb_refs leaf_list) -> structure:
           parameter "name" of String, parameter "description" of String,
           parameter "type" of String, parameter "tree" of type "newick_tree"
           (Trees are represented in KBase by default in newick format
           (http://en.wikipedia.org/wiki/Newick_format) and are returned to
           you in this format by default.) -> type "tree" (A string
           representation of a phylogenetic tree.  The format/syntax of the
           string is specified by using one of the available typedefs
           declaring a particular format, such as 'newick_tree',
           'phylo_xml_tree' or 'json_tree'.  When a format is not explicitly
           specified, it is possible to return trees in different formats
           depending on additional parameters. Regardless of format, all leaf
           nodes in trees built from MSAs are indexed to a specific MSA row. 
           You can use the appropriate functionality of the API to replace
           these IDs with other KBase Ids instead. Internal nodes may or may
           not be named. Nodes, depending on the format, may also be
           annotated with structured data such as bootstrap values and
           distances.), parameter "tree_attributes" of mapping from String to
           String, parameter "default_node_labels" of mapping from type
           "node_id" to type "label", parameter "ws_refs" of mapping from
           type "node_id" to mapping from type "ref_type" (An enumeration of
           reference types for a node.  Either the one letter abbreviation or
           full name can be given.  For large trees, it is strongly advised
           you use the one letter abbreviations. Supported types are: g |
           genome  => genome typed object or CDS data p | protein => protein
           sequence object or CDS data, often given as the MD5 of the
           sequence n | dna     => dna sequence object or CDS data, often
           given as the MD5 of the sequence f | feature => feature object or
           CDS data) to list of type "ws_obj_id" (@id ws), parameter
           "kb_refs" of mapping from type "node_id" to mapping from type
           "ref_type" (An enumeration of reference types for a node.  Either
           the one letter abbreviation or full name can be given.  For large
           trees, it is strongly advised you use the one letter abbreviations.
           Supported types are: g | genome  => genome typed object or CDS
           data p | protein => protein sequence object or CDS data, often
           given as the MD5 of the sequence n | dna     => dna sequence
           object or CDS data, often given as the MD5 of the sequence f |
           feature => feature object or CDS data) to list of type "kbase_id"
           (A KBase ID is a string starting with the characters "kb|".  KBase
           IDs are typed. The types are designated using a short string. For
           instance, "g" denotes a genome, "tree" denotes a Tree, and "aln"
           denotes a sequence alignment. KBase IDs may be hierarchical.  For
           example, if a KBase genome identifier is "kb|g.1234", a protein
           encoding gene within that genome may be represented as
           "kb|g.1234.peg.771". @id kb), parameter "leaf_list" of list of
           type "node_id", parameter "info" of type "object_info"
           (Information about an object, including user provided metadata.
           obj_id objid - the numerical id of the object. obj_name name - the
           name of the object. type_string type - the type of the object.
           timestamp save_date - the save date of the object. obj_ver ver -
           the version of the object. username saved_by - the user that saved
           or copied the object. ws_id wsid - the workspace containing the
           object. ws_name workspace - the workspace containing the object.
           string chsum - the md5 checksum of the object. int size - the size
           of the object in bytes. usermeta meta - arbitrary user-supplied
           metadata about the object.) -> tuple of size 11: parameter "objid"
           of type "obj_id" (The unique, permanent numerical ID of an
           object.), parameter "name" of type "obj_name" (A string used as a
           name for an object. Any string consisting of alphanumeric
           characters and the characters |._- that is not an integer is
           acceptable.), parameter "type" of type "type_string" (A type
           string. Specifies the type and its version in a single string in
           the format [module].[typename]-[major].[minor]: module - a string.
           The module name of the typespec containing the type. typename - a
           string. The name of the type as assigned by the typedef statement.
           major - an integer. The major version of the type. A change in the
           major version implies the type has changed in a non-backwards
           compatible way. minor - an integer. The minor version of the type.
           A change in the minor version implies that the type has changed in
           a way that is backwards compatible with previous type definitions.
           In many cases, the major and minor versions are optional, and if
           not provided the most recent version will be used. Example:
           MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A
           time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
           character Z (representing the UTC timezone) or the difference in
           time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500
           (EST time) 2013-04-03T08:56:32+0000 (UTC time)
           2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long,
           parameter "saved_by" of type "username" (Login name of a KBase
           user account.), parameter "wsid" of type "ws_id" (The unique,
           permanent numerical ID of a workspace.), parameter "workspace" of
           type "ws_name" (A string used as a name for a workspace. Any
           string consisting of alphanumeric characters and "_", ".", or "-"
           that is not an integer is acceptable. The name may optionally be
           prefixed with the workspace owner's user name and a colon, e.g.
           kbasetest:my_workspace.), parameter "chsum" of String, parameter
           "size" of Long, parameter "meta" of type "usermeta" (User provided
           metadata about an object. Arbitrary key-value pairs provided by
           the user.) -> mapping from String to String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN get_trees
        logging.info("Starting 'get_trees' with params:{}".format(params))
        self.utils.validate_params(params, ("tree_refs",), ("included_fields",))
        ws_objs = [{'ref': r, 'included': params.get('included_fields', None)}
                   for r in params['tree_refs']]
        result = self.ws.get_objects2({'objects': ws_objs})['data']
        #END get_trees

        # At some point might do deeper type checking...
        if not isinstance(result, list):
            raise ValueError('Method get_trees return value ' +
                             'result is not type list as required.')
        # return the results
        return [result]

    def save_trees(self, ctx, params):
        """
        :param params: instance of type "SaveTreesParams" -> structure:
           parameter "ws_id" of type "ws_id" (The unique, permanent numerical
           ID of a workspace.), parameter "trees" of list of type
           "ObjectSaveData" (An object and associated data required for
           saving. Required arguments: type_string type - the type of the
           object. Omit the version information to use the latest version.
           UnspecifiedObject data - the object data. Optional arguments: One
           of an object name or id. If no name or id is provided the name
           will be set to 'auto' with the object id appended as a string,
           possibly with -\d+ appended if that object id already exists as a
           name. obj_name name - the name of the object. obj_id objid - the
           id of the object to save over. usermeta meta - arbitrary
           user-supplied metadata for the object, not to exceed 16kb; if the
           object type specifies automatic metadata extraction with the 'meta
           ws' annotation, and your metadata name conflicts, then your
           metadata will be silently overwritten. list<ProvenanceAction>
           provenance - provenance data for the object. boolean hidden - true
           if this object should not be listed when listing workspace
           objects.) -> structure: parameter "type" of type "type_string" (A
           type string. Specifies the type and its version in a single string
           in the format [module].[typename]-[major].[minor]: module - a
           string. The module name of the typespec containing the type.
           typename - a string. The name of the type as assigned by the
           typedef statement. major - an integer. The major version of the
           type. A change in the major version implies the type has changed
           in a non-backwards compatible way. minor - an integer. The minor
           version of the type. A change in the minor version implies that
           the type has changed in a way that is backwards compatible with
           previous type definitions. In many cases, the major and minor
           versions are optional, and if not provided the most recent version
           will be used. Example: MyModule.MyType-3.1), parameter "data" of
           unspecified object, parameter "name" of type "obj_name" (A string
           used as a name for an object. Any string consisting of
           alphanumeric characters and the characters |._- that is not an
           integer is acceptable.), parameter "objid" of type "obj_id" (The
           unique, permanent numerical ID of an object.), parameter "meta" of
           type "usermeta" (User provided metadata about an object. Arbitrary
           key-value pairs provided by the user.) -> mapping from String to
           String, parameter "provenance" of list of type "ProvenanceAction"
           (A provenance action. A provenance action (PA) is an action taken
           while transforming one data object to another. There may be
           several PAs taken in series. A PA is typically running a script,
           running an api command, etc. All of the following fields are
           optional, but more information provided equates to better data
           provenance. resolved_ws_objects should never be set by the user;
           it is set by the workspace service when returning data. On input,
           only one of the time or epoch may be supplied. Both are supplied
           on output. The maximum size of the entire provenance object,
           including all actions, is 1MB. timestamp time - the time the
           action was started epoch epoch - the time the action was started.
           string caller - the name or id of the invoker of this provenance
           action. In most cases, this will be the same for all PAs. string
           service - the name of the service that performed this action.
           string service_ver - the version of the service that performed
           this action. string method - the method of the service that
           performed this action. list<UnspecifiedObject> method_params - the
           parameters of the method that performed this action. If an object
           in the parameters is a workspace object, also put the object
           reference in the input_ws_object list. string script - the name of
           the script that performed this action. string script_ver - the
           version of the script that performed this action. string
           script_command_line - the command line provided to the script that
           performed this action. If workspace objects were provided in the
           command line, also put the object reference in the input_ws_object
           list. list<obj_ref> input_ws_objects - the workspace objects that
           were used as input to this action; typically these will also be
           present as parts of the method_params or the script_command_line
           arguments. list<obj_ref> resolved_ws_objects - the workspace
           objects ids from input_ws_objects resolved to permanent workspace
           object references by the workspace service. list<string>
           intermediate_incoming - if the previous action produced output
           that 1) was not stored in a referable way, and 2) is used as
           input for this action, provide it with an arbitrary and unique ID
           here, in the order of the input arguments to this action. These
           IDs can be used in the method_params argument. list<string>
           intermediate_outgoing - if this action produced output that 1) was
           not stored in a referable way, and 2) is used as input for the
           next action, provide it with an arbitrary and unique ID here, in
           the order of the output values from this action. These IDs can be
           used in the intermediate_incoming argument in the next action.
           list<ExternalDataUnit> external_data - data external to the
           workspace that was either imported to the workspace or used to
           create a workspace object. list<SubAction> subactions - the
           subactions taken as a part of this action. mapping<string, string>
           custom - user definable custom provenance fields and their values.
           string description - a free text description of this action.) ->
           structure: parameter "time" of type "timestamp" (A time in the
           format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z
           (representing the UTC timezone) or the difference in time to UTC
           in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time)
           2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC
           time)), parameter "epoch" of type "epoch" (A Unix epoch (the time
           since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "caller"
           of String, parameter "service" of String, parameter "service_ver"
           of String, parameter "method" of String, parameter "method_params"
           of list of unspecified object, parameter "script" of String,
           parameter "script_ver" of String, parameter "script_command_line"
           of String, parameter "input_ws_objects" of list of type "obj_ref"
           (A string that uniquely identifies an object in the workspace
           service. There are two ways to uniquely identify an object in one
           string: "[ws_name or id]/[obj_name or id]/[obj_ver]" - for
           example, "MyFirstWorkspace/MyFirstObject/3" would identify the
           third version of an object called MyFirstObject in the workspace
           called MyFirstWorkspace. 42/Panic/1 would identify the first
           version of the object name Panic in workspace with id 42.
           Towel/1/6 would identify the 6th version of the object with id 1
           in the Towel workspace. "kb|ws.[ws_id].obj.[obj_id].ver.[obj_ver]"
           - for example, "kb|ws.23.obj.567.ver.2" would identify the second
           version of an object with id 567 in a workspace with id 23. In all
           cases, if the version number is omitted, the latest version of the
           object is assumed.), parameter "resolved_ws_objects" of list of
           type "obj_ref" (A string that uniquely identifies an object in the
           workspace service. There are two ways to uniquely identify an
           object in one string: "[ws_name or id]/[obj_name or id]/[obj_ver]"
           - for example, "MyFirstWorkspace/MyFirstObject/3" would identify
           the third version of an object called MyFirstObject in the
           workspace called MyFirstWorkspace. 42/Panic/1 would identify the
           first version of the object name Panic in workspace with id 42.
           Towel/1/6 would identify the 6th version of the object with id 1
           in the Towel workspace. "kb|ws.[ws_id].obj.[obj_id].ver.[obj_ver]"
           - for example, "kb|ws.23.obj.567.ver.2" would identify the second
           version of an object with id 567 in a workspace with id 23. In all
           cases, if the version number is omitted, the latest version of the
           object is assumed.), parameter "intermediate_incoming" of list of
           String, parameter "intermediate_outgoing" of list of String,
           parameter "external_data" of list of type "ExternalDataUnit" (An
           external data unit. A piece of data from a source outside the
           Workspace. On input, only one of the resource_release_date or
           resource_release_epoch may be supplied. Both are supplied on
           output. string resource_name - the name of the resource, for
           example JGI. string resource_url - the url of the resource, for
           example http://genome.jgi.doe.gov string resource_version -
           version of the resource timestamp resource_release_date - the
           release date of the resource epoch resource_release_epoch - the
           release date of the resource string data_url - the url of the
           data, for example
           http://genome.jgi.doe.gov/pages/dynamicOrganismDownload.jsf?
           organism=BlaspURHD0036 string data_id - the id of the data, for
           example 7625.2.79179.AGTTCC.adnq.fastq.gz string description - a
           free text description of the data.) -> structure: parameter
           "resource_name" of String, parameter "resource_url" of String,
           parameter "resource_version" of String, parameter
           "resource_release_date" of type "timestamp" (A time in the format
           YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z
           (representing the UTC timezone) or the difference in time to UTC
           in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time)
           2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC
           time)), parameter "resource_release_epoch" of type "epoch" (A Unix
           epoch (the time since 00:00:00 1/1/1970 UTC) in milliseconds.),
           parameter "data_url" of String, parameter "data_id" of String,
           parameter "description" of String, parameter "subactions" of list
           of type "SubAction" (Information about a subaction that is invoked
           by a provenance action. A provenance action (PA) may invoke
           subactions (SA), e.g. calling a separate piece of code, a service,
           or a script. In most cases these calls are the same from PA to PA
           and so do not need to be listed in the provenance since providing
           information about the PA alone provides reproducibility. In some
           cases, however, SAs may change over time, such that invoking the
           same PA with the same parameters may produce different results.
           For example, if a PA calls a remote server, that server may be
           updated between a PA invoked on day T and another PA invoked on
           day T+1. The SubAction structure allows for specifying information
           about SAs that may dynamically change from PA invocation to PA
           invocation. string name - the name of the SA. string ver - the
           version of SA. string code_url - a url pointing to the SA's
           codebase. string commit - a version control commit ID for the SA.
           string endpoint_url - a url pointing to the access point for the
           SA - a server url, for instance.) -> structure: parameter "name"
           of String, parameter "ver" of String, parameter "code_url" of
           String, parameter "commit" of String, parameter "endpoint_url" of
           String, parameter "custom" of mapping from String to String,
           parameter "description" of String, parameter "hidden" of type
           "boolean" (A boolean. 0 = false, other = true.)
        :returns: instance of list of type "object_info" (Information about
           an object, including user provided metadata. obj_id objid - the
           numerical id of the object. obj_name name - the name of the
           object. type_string type - the type of the object. timestamp
           save_date - the save date of the object. obj_ver ver - the version
           of the object. username saved_by - the user that saved or copied
           the object. ws_id wsid - the workspace containing the object.
           ws_name workspace - the workspace containing the object. string
           chsum - the md5 checksum of the object. int size - the size of the
           object in bytes. usermeta meta - arbitrary user-supplied metadata
           about the object.) -> tuple of size 11: parameter "objid" of type
           "obj_id" (The unique, permanent numerical ID of an object.),
           parameter "name" of type "obj_name" (A string used as a name for
           an object. Any string consisting of alphanumeric characters and
           the characters |._- that is not an integer is acceptable.),
           parameter "type" of type "type_string" (A type string. Specifies
           the type and its version in a single string in the format
           [module].[typename]-[major].[minor]: module - a string. The module
           name of the typespec containing the type. typename - a string. The
           name of the type as assigned by the typedef statement. major - an
           integer. The major version of the type. A change in the major
           version implies the type has changed in a non-backwards compatible
           way. minor - an integer. The minor version of the type. A change
           in the minor version implies that the type has changed in a way
           that is backwards compatible with previous type definitions. In
           many cases, the major and minor versions are optional, and if not
           provided the most recent version will be used. Example:
           MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A
           time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
           character Z (representing the UTC timezone) or the difference in
           time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500
           (EST time) 2013-04-03T08:56:32+0000 (UTC time)
           2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long,
           parameter "saved_by" of type "username" (Login name of a KBase
           user account.), parameter "wsid" of type "ws_id" (The unique,
           permanent numerical ID of a workspace.), parameter "workspace" of
           type "ws_name" (A string used as a name for a workspace. Any
           string consisting of alphanumeric characters and "_", ".", or "-"
           that is not an integer is acceptable. The name may optionally be
           prefixed with the workspace owner's user name and a colon, e.g.
           kbasetest:my_workspace.), parameter "chsum" of String, parameter
           "size" of Long, parameter "meta" of type "usermeta" (User provided
           metadata about an object. Arbitrary key-value pairs provided by
           the user.) -> mapping from String to String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN save_trees
        logging.info("Starting 'save_trees'")
        self.utils.validate_params(params, ("ws_id", "trees"), ('type',))
        trees = []
        for i, t in enumerate(params['trees']):
            self.utils.validate_params(t, ("data",), ("name", "hidden", "meta", "type"))
            if 'type' in t and t['type'] != 'KBaseTrees.Tree':
                raise ValueError("This method only saves KBaseTrees.Tree objects")
            if "tree" not in t['data']:
                raise ValueError("Object {} missing 'tree' attribute containing newick tree"
                                 .format(i))
            if not Utils.validate_newick(t['data']['tree']):
                raise ValueError("Object {} has an invalid newick tree: {}"
                                 .format(i, t['data']['tree']))

            t['type'] = 'KBaseTrees.Tree'
            trees.append(t)

        result = self.dfu.save_objects({"id": params["ws_id"], "objects": trees})
        #END save_trees

        # At some point might do deeper type checking...
        if not isinstance(result, list):
            raise ValueError('Method save_trees return value ' +
                             'result is not type list as required.')
        # return the results
        return [result]
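
    # Illustrative parameter sketch, not from the original code: a minimal
    # save_trees call. The workspace id, object name and newick string are
    # placeholders; 'data' must carry a 'tree' attribute with a valid newick
    # string, as enforced above.
    #
    #     params = {
    #         'ws_id': 12345,
    #         'trees': [{
    #             'name': 'my_tree',
    #             'data': {'tree': '(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);'}
    #         }]
    #     }
    #     result = impl.save_trees(ctx, params)[0]  # list of object_info tuples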

    def tree_to_newick_file(self, ctx, params):
        """
        :param params: instance of type "TreeToNewickFileParams" ->
           structure: parameter "input_ref" of type "Tree_id" (@id kb
           KBaseTrees.Tree), parameter "destination_dir" of String
        :returns: instance of type "TreeToNewickFileOutput" -> structure:
           parameter "file_path" of String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN tree_to_newick_file
        logging.info("Starting 'tree_to_newick_file' with params: {}".format(params))
        self.utils.validate_params(params, ("destination_dir", "input_ref"))
        _, result = self.utils.to_newick(params)
        #END tree_to_newick_file

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method tree_to_newick_file return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]

    def export_tree_newick(self, ctx, params):
        """
        :param params: instance of type "ExportTreeParams" -> structure:
           parameter "input_ref" of type "Tree_id" (@id kb KBaseTrees.Tree)
        :returns: instance of type "ExportTreeOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN export_tree_newick
        logging.info("Starting 'export_tree_newick' with params:{}".format(params))
        self.utils.validate_params(params, ("input_ref",))
        params['destination_dir'] = self.scratch
        cs_id, files = self.utils.to_newick(params)
        result = self.utils.export(files['file_path'], cs_id, params['input_ref'])
        #END export_tree_newick

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method export_tree_newick return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
Example No. 9
0
class GenericsUtil:
    def _validate_fetch_data_params(self, params):
        """
        _validate_fetch_data_params:
            validates params passed to fetch_data method
        """

        log('start validating fetch_data params')

        # check for required parameters
        for p in ['obj_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _validate_import_matrix_from_excel_params(self, params):
        """
        _validate_import_matrix_from_excel_params:
            validates params passed to import_matrix_from_excel method
        """
        log('start validating import_matrix_from_excel params')

        # check for required parameters
        for p in ['obj_type', 'matrix_name', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in MATRIX_TYPE:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file({
                'shock_id': params['input_shock_id'],
                'file_path': self.scratch
            }).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path': params.get('input_staging_file_path')
            }).get('copy_file_path')
        else:
            error_msg = "Must supply either an input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        refs_key = [
            'col_conditionset_ref', 'row_conditionset_ref', 'genome_ref',
            'diff_expr_matrix_ref'
        ]
        refs = {k: v for k, v in params.items() if k in refs_key}

        return (obj_type, file_path, params.get('workspace_name'),
                params.get('matrix_name'), refs)

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
        """
        log('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {'file_path': file_path, 'pack': 'zip'}
        shock_id = self.dfu.file_to_shock(file_to_shock_params).get('shock_id')

        return shock_id

    def _upload_dir_to_shock(self, directory):
        """
        _upload_dir_to_shock: upload target dir to shock using DataFileUtil
        """
        log('Start uploading directory to shock: {}'.format(directory))

        file_to_shock_params = {'file_path': directory, 'pack': 'zip'}
        shock_file = self.dfu.file_to_shock(file_to_shock_params)

        shock_id = shock_file.get('shock_id')

        return shock_id

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
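
    # Note (editorial): on Python 3.2+ the same behaviour is available as
    # os.makedirs(path, exist_ok=True); the errno check above keeps the helper
    # compatible with older interpreters.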

    def _generate_html_string(self, df):
        """
        _generate_html_string: generate an HTML string from a dataframe
        template used: https://developers.google.com/chart/interactive/docs/gallery/table
                       https://developers.google.com/chart/interactive/docs/reference#formatters
        """
        dtypes = df.dtypes
        columns = df.columns

        column_str = ''
        number_columns = []
        for idx, column in enumerate(columns):
            dtype = dtypes[idx].name
            if 'int' in dtype or 'float' in dtype:
                column_str += "data.addColumn('number', '{}')\n".format(column)
                number_columns.append(column)
            else:
                column_str += "data.addColumn('string', '{}')\n".format(column)

        data_str = "data.addRows({})".format(df.values.tolist())

        formatter_str = ''
        for number_column in number_columns:
            mean = round(df[number_column].mean(), 2)
            column_n = columns.tolist().index(number_column)
            formatter_str += "var formatter_{} = ".format(column_n)
            formatter_str += "new google.visualization.BarFormat({base: "
            formatter_str += str(mean)
            formatter_str += ", width: 120});\n"
            formatter_str += "formatter_{}.format(data, {});\n".format(
                column_n, column_n)

        return column_str, data_str, formatter_str

    def _find_between(self, s, start, end):
        """
        _find_between: find string in between start and end
        """

        return re.search('{}(.*){}'.format(start, end), s).group(1)
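
    # Usage sketch (illustrative, hypothetical type string): extracting the
    # bare type name from a workspace type string with the helper above.
    #
    #     self._find_between('KBaseMatrices.ExpressionMatrix-1.1', r'\.', r'\-')
    #     # -> 'ExpressionMatrix'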

    def _find_type_spec(self, obj_type):
        """
        _find_type_spec: find body spec of type
        """
        obj_type_name = self._find_between(obj_type, r'\.', r'\-')

        type_info = self.wsClient.get_type_info(obj_type)
        type_spec = type_info.get('spec_def')

        type_spec_list = type_spec.split(obj_type_name + ';')
        obj_type_spec = type_spec_list[0].split('structure')[-1]
        log('Found spec for {}\n{}\n'.format(obj_type, obj_type_spec))

        return obj_type_spec

    def _find_constraints(self, obj_type):
        """
        _find_constraints: retrieve constraints (@contains, rowsum, unique)
        """

        type_info = self.wsClient.get_type_info(obj_type)
        type_desc = type_info.get('description')

        constraints = {'contains': [], 'rowsum': [], 'unique': []}

        unique = [
            item.split('\n')[0].strip()
            for item in type_desc.split('@unique')[1:]
        ]
        constraints['unique'] = unique

        contains = [
            item.split('\n')[0].strip()
            for item in type_desc.split('@contains')[1:]
        ]
        constraints['contains'] = contains

        return constraints
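
    # Illustrative sketch (hypothetical type description, not a real spec):
    # given a description containing the lines
    #
    #     @unique data.row_ids
    #     @contains data.col_ids col_conditionset_ref:conditions
    #
    # the helper above would return
    #     {'contains': ['data.col_ids col_conditionset_ref:conditions'],
    #      'rowsum': [],
    #      'unique': ['data.row_ids']}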

    def _find_generics_type(self, obj_type):
        """
        _find_generics_type: try to find generics type in an object
        """

        log('Start finding generics type and name')

        obj_type_spec = self._find_type_spec(obj_type)

        if not obj_type_spec:
            raise ValueError('Cannot retrieve spec for: {}'.format(obj_type))

        generics_types = [
            generics_type for generics_type in GENERICS_TYPE
            if generics_type in obj_type_spec
        ]

        if not generics_types:
            error_msg = 'Cannot find generics type in spec:\n{}\n'.format(
                obj_type_spec)
            raise ValueError(error_msg)

        generics_module = dict()
        for generics_type in generics_types:
            for item in obj_type_spec.split(generics_type)[1:]:
                generics_type_name = item.split(';')[0].strip().split(
                    ' ')[-1].strip()
                generics_module.update({generics_type_name: generics_type})

        log('Found generics type:\n{}\n'.format(generics_module))

        return generics_module

    def _convert_data(self, data, generics_module):
        """
        _convert_data: convert data to df based on data_type
        """

        data_types = list(generics_module.values())

        if not set(GENERICS_TYPE) >= set(data_types):
            raise ValueError(
                'Found unknown generics data type in:\n{}\n'.format(
                    data_types))

        if data_types == ['FloatMatrix2D']:
            key = list(generics_module.keys())[data_types.index(
                'FloatMatrix2D')]
            values = data[key]['values']
            index = data[key]['row_ids']
            columns = data[key]['col_ids']
            df = pd.DataFrame(values, index=index, columns=columns)
        # elif 'FloatMatrix2D' in data_types:  # default case
        #     key = generics_module.keys()[generics_module.values().index('FloatMatrix2D')]
        #     values = data[key]['values']
        #     index = data[key]['row_ids']
        #     columns = data[key]['col_ids']
        #     df = pd.DataFrame(values, index=index, columns=columns)
        else:
            raise ValueError('Unexpected Error')

        return df.to_json()
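
    # Illustrative sketch (hypothetical data, not from the original code): a
    # single FloatMatrix2D member named 'data' converts to a 2x2 dataframe.
    #
    #     data = {'data': {'row_ids': ['gene_1', 'gene_2'],
    #                      'col_ids': ['cond_1', 'cond_2'],
    #                      'values': [[1.0, 2.0], [3.0, 4.0]]}}
    #     generics_module = {'data': 'FloatMatrix2D'}
    #     self._convert_data(data, generics_module)
    #     # -> JSON string of a DataFrame indexed by row_ids with col_ids columns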

    def _retrieve_data(self, obj_ref, generics_module=None):
        """
        _retrieve_data: retrieve object data and return a dataframe in json format
        """
        log('Start retrieving data')
        obj_source = self.dfu.get_objects({"object_refs":
                                           [obj_ref]})['data'][0]

        obj_info = obj_source.get('info')
        obj_data = obj_source.get('data')

        if not generics_module:
            generics_module = self._find_generics_type(obj_info[2])

        try:
            data = {
                k: v
                for k, v in obj_data.items() if k in generics_module.keys()
            }
        except KeyError:
            raise ValueError('Retrieved wrong generics type name')

        data_matrix = self._convert_data(data, generics_module)

        return data_matrix

    def _get_col_cond_list(self, col_mapping, col_conditionset_ref, cols):
        """
        _get_col_cond_list: generate col condition list for excel
        """
        col_cond_list = []

        conditionset_data = self.dfu.get_objects(
            {"object_refs": [col_conditionset_ref]})['data'][0]['data']
        col_condition_names = [
            factor.get('factor') for factor in conditionset_data.get('factors')
        ]
        for col in cols:
            condition_id = col_mapping.get(col)
            if condition_id:
                col_cond_list.append(
                    conditionset_data.get('conditions').get(condition_id))
            else:
                col_cond_list.append([''] * len(col_condition_names))

        col_cond_list = list(map(list, zip(*col_cond_list)))
        for idx, col_array in enumerate(col_cond_list):
            col_array.insert(0, col_condition_names[idx])

        return col_cond_list

    def _get_row_cond_list(self, row_mapping, row_conditionset_ref, rows):
        """
        _get_row_cond_list: generate row condition list for excel
        """
        row_cond_list = []

        conditionset_data = self.dfu.get_objects(
            {"object_refs": [row_conditionset_ref]})['data'][0]['data']
        row_condition_names = [
            factor.get('factor') for factor in conditionset_data.get('factors')
        ]

        row_cond_list.append(row_condition_names)

        for row in rows:
            condition_id = row_mapping.get(row)
            if condition_id:
                row_cond_list.append(
                    conditionset_data.get('conditions').get(condition_id))
            else:
                row_cond_list.append([''] * len(row_condition_names))

        return row_cond_list

    def _get_data_list(self, cols, rows, values):
        """
        _get_data_list: generate data value list for excel
        """
        data_arrays = []
        cols.insert(0, '')
        data_arrays.append(cols)
        for idx, row in enumerate(rows):
            values[idx].insert(0, row)
        data_arrays += values

        return data_arrays
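
    # Usage sketch (illustrative, hypothetical values): building the excel
    # grid for a 2x2 matrix.
    #
    #     self._get_data_list(['c1', 'c2'], ['r1', 'r2'],
    #                         [[1.0, 2.0], [3.0, 4.0]])
    #     # -> [['', 'c1', 'c2'], ['r1', 1.0, 2.0], ['r2', 3.0, 4.0]]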

    def _merge_cond_list(self, excel_list, col_cond_list, row_cond_list):
        """
        _merge_cond_list: merge lists for excel
        """
        col_cond_len = len(col_cond_list)
        for item in excel_list[:col_cond_len]:
            row_len = len(row_cond_list[0]) if row_cond_list else 0
            item[0:0] = [''] * row_len

        if row_cond_list:
            for idx, item in enumerate(excel_list[col_cond_len:]):
                item[0:0] = row_cond_list[idx]

    @staticmethod
    def _is_number(s):
        """
        _is_number: check whether the string s represents a number
        """
        try:
            float(s)
            return True
        except ValueError:
            pass

        return False
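
    # Usage sketch (illustrative): self._is_number('3.14') -> True,
    # self._is_number('abc') -> False.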

    def _gen_excel(self, excel_list, obj_name):
        """
        _gen_excel: create excel
        """

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        file_path = os.path.join(result_directory, '{}.xlsx'.format(obj_name))

        log('Start writing to file: {}'.format(file_path))

        workbook = xlsxwriter.Workbook(file_path, {'nan_inf_to_errors': True})
        worksheet = workbook.add_worksheet()

        row = 1
        for data_entry in excel_list:
            for idx, cell_data in enumerate(data_entry):
                worksheet.write(row, idx, cell_data)

            row += 1

        workbook.close()

        return file_path

    def _write_mapping_sheet(self, file_path, sheet_name, mapping, index):
        """
        _write_mapping_sheet: write mapping to sheet
        """
        df_dict = collections.OrderedDict()

        df_dict[index[0]] = []
        df_dict[index[1]] = []

        for key, value in mapping.items():
            df_dict.get(index[0]).append(key)
            df_dict.get(index[1]).append(value)

        df = pd.DataFrame.from_dict(df_dict)

        with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
            writer.book = load_workbook(file_path)
            df.to_excel(writer, sheet_name=sheet_name)

    def _filter_constraints(self, constraints, data):

        contains_constraints = constraints.get('contains')

        filtered_constraints = []
        for contains_constraint in contains_constraints:
            in_values = contains_constraint.split(' ')[1:]
            missing_key = True
            for in_value in in_values:
                if in_value.startswith('values'):
                    search_value = re.search(r'\((.*)\)', in_value).group(1)
                    unique_list = search_value.split('.')
                    key = unique_list[0]
                elif ':' in in_value:
                    key = in_value.split(':')[0]
                else:
                    unique_list = in_value.split('.')
                    key = unique_list[0]

                if key in data:
                    missing_key = False
                    break

            if missing_key:
                filtered_constraints.append(contains_constraint)

        for x in filtered_constraints:
            contains_constraints.remove(x)

        return constraints
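
    # A hedged sketch of the constraint strings this helper expects, based on how
    # they are parsed above (tokens are space separated; the first token is the
    # field being checked and the remaining tokens are the allowed sources). The
    # field names below are illustrative only:
    #
    #   constraints = {
    #       'contains': ['data.row_ids values(row_mapping)'],
    #       'unique': ['data.row_ids'],
    #       'rowsum': [],
    #   }
    #
    # A 'contains' constraint is dropped here when none of the keys it refers to
    # (e.g. 'row_mapping') is present in the data being validated.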

    def _retrieve_value(self, data, value):
        log('Getting value for {}'.format(value))
        retrieve_data = []
        m_data = DotMap(data)
        if value.startswith(
                'values'):  # TODO: nested values e.g. values(values(ids))
            search_value = re.search(r'\((.*)\)', value).group(1)
            unique_list = search_value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = m_data_cp.values()
        elif ':' in value:
            obj_ref = getattr(m_data, value.split(':')[0])
            if obj_ref:
                included = value.split(':')[1]
                included = '/' + included.replace('.', '/')
                ref_data = self.wsClient.get_objects2(
                    {'objects': [{
                        'ref': obj_ref,
                        'included': [included]
                    }]})['data'][0]['data']
                m_ref_data = DotMap(ref_data)
                if ref_data:
                    if '*' not in included:
                        for key in included.split('/')[1:]:
                            m_ref_data = getattr(m_ref_data, key)
                    else:
                        keys = included.split('/')[1:]
                        m_ref_data = [
                            x.get(keys[2]) for x in ref_data.get(keys[0])
                        ]  # TODO: only works for 2 level nested data like '/features/[*]/id'

                retrieve_data = list(m_ref_data)
        else:
            unique_list = value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp)

        log('Retrieved value (first 20):\n{}\n'.format(retrieve_data[:20]))

        return retrieve_data
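
    # The three value-path forms handled above, as a hedged illustration
    # (the field names are hypothetical):
    #
    #   self._retrieve_value(data, 'data.row_ids')
    #       -> walks the nested attributes and returns list(data['data']['row_ids'])
    #   self._retrieve_value(data, 'values(row_mapping)')
    #       -> returns the values of the data['row_mapping'] mapping
    #   self._retrieve_value(data, 'genome_ref:features.[*].id')
    #       -> fetches the referenced workspace object and returns the id of every
    #          feature (only two-level paths of this shape are supported)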

    def _generate_report(self, matrix_obj_ref, workspace_name):
        """
        _generate_report: generate summary report
        """

        report_params = {
            'message':
            '',
            'objects_created': [{
                'ref': matrix_obj_ref,
                'description': 'Imported Matrix'
            }],
            'workspace_name':
            workspace_name,
            'report_object_name':
            'import_matrix_from_excel_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _validate(self, constraints, data):
        """
        _validate: validate data
        """

        validated = True
        failed_constraints = {'contains': [], 'rowsum': [], 'unique': []}

        unique_constraints = constraints.get('unique')
        for unique_constraint in unique_constraints:
            retrieved_value = self._retrieve_value(data, unique_constraint)
            if len(set(retrieved_value)) != len(retrieved_value):
                validated = False
                failed_constraints['unique'].append(unique_constraint)

        contains_constraints = constraints.get('contains')
        for contains_constraint in contains_constraints:
            value = contains_constraint.split(' ')[0]
            in_values = contains_constraint.split(' ')[1:]
            retrieved_in_values = []
            for in_value in in_values:
                retrieved_in_values += self._retrieve_value(data, in_value)
            if not (set(self._retrieve_value(data, value)) <=
                    set(retrieved_in_values)):
                validated = False
                failed_constraints['contains'].append(contains_constraint)

        return validated, failed_constraints
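
    # Hedged sketch of the return value, assuming the illustrative constraints
    # shown after _filter_constraints above:
    #
    #   validated, failed = self._validate(constraints, data)
    #   # validated -> False as soon as any constraint fails
    #   # failed    -> e.g. {'contains': [], 'rowsum': [], 'unique': ['data.row_ids']}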

    def _process_mapping_sheet(self, file_path, sheet_name):
        """
        _process_mapping_sheet: process mapping sheet
        """

        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        except XLRDError:
            return dict()
        else:
            mapping = {value[0]: value[1] for value in df.values.tolist()}

        return mapping

    def _process_conditionset_sheet(self, file_path, sheet_name, matrix_name,
                                    workspace_id):
        """
        _process_conditionset_sheet: process condition set sheet
        """

        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        except XLRDError:
            return ''
        else:
            obj_name = '{}_{}'.format(sheet_name, matrix_name)
            result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
            self._mkdir_p(result_directory)
            file_path = os.path.join(result_directory,
                                     '{}.xlsx'.format(obj_name))
            df.to_excel(file_path)
            import_condition_set_params = {
                'output_obj_name': obj_name,
                'output_ws_id': workspace_id,
                'input_file_path': file_path
            }

            ref = self.cu.file_to_condition_set(import_condition_set_params)

            return ref.get('condition_set_ref')

    def _file_to_data(self, file_path, refs, matrix_name, workspace_id):
        log('Start reading and converting excel file data')
        data = refs

        try:
            pd.read_excel(file_path)
        except XLRDError:
            # TODO: convert csv file to excel
            log('Found csv file')
            raise ValueError('Only .xlsx files are supported; please provide an .xlsx file')

        # processing data sheet
        try:
            df = pd.read_excel(file_path, sheet_name='data')
        except XLRDError:
            raise ValueError('Cannot find <data> sheet')
        else:
            df.fillna(0, inplace=True)
            matrix_data = {
                'row_ids': df.index.tolist(),
                'col_ids': df.columns.tolist(),
                'values': df.values.tolist()
            }

            data.update({'data': matrix_data})

        # processing col/row_mapping
        col_mapping = self._process_mapping_sheet(file_path, 'col_mapping')
        data.update({'col_mapping': col_mapping})

        row_mapping = self._process_mapping_sheet(file_path, 'row_mapping')
        data.update({'row_mapping': row_mapping})

        # processing col/row_conditionset
        col_conditionset_ref = self._process_conditionset_sheet(
            file_path, 'col_conditionset', matrix_name, workspace_id)
        data.update({'col_conditionset_ref': col_conditionset_ref})

        row_conditionset_ref = self._process_conditionset_sheet(
            file_path, 'row_conditionset', matrix_name, workspace_id)
        data.update({'row_conditionset_ref': row_conditionset_ref})

        # processing metadata
        metadata = self._process_mapping_sheet(file_path, 'metadata')
        data.update(metadata)

        return data
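
    # Expected workbook layout for _file_to_data, inferred from the sheets read
    # above (only the 'data' sheet is mandatory; the others may be omitted):
    #
    #   data              matrix values; row ids come from the dataframe index,
    #                     column ids from the header row
    #   col_mapping       two columns mapping column ids to condition ids
    #   row_mapping       two columns mapping row ids to condition ids
    #   col_conditionset  condition table imported via ConditionUtils
    #   row_conditionset  condition table imported via ConditionUtils
    #   metadata          two columns of key/value pairs merged into the object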

    def _build_header_str(self, factor_names):

        header_str = ''
        width = 100.0 / len(factor_names)

        header_str += '<tr class="header">'
        header_str += '<th style="width:{0:.2f}%;">Feature ID</th>'.format(
            width)

        for factor_name in factor_names:
            header_str += '<th style="width:{0:.2f}%;"'.format(width)
            header_str += '>{}</th>'.format(factor_name)
        header_str += '</tr>'

        return header_str

    def _build_html_str(self, row_mapping, conditionset_data, row_ids):

        log('Start building html replacement')

        factor_names = [
            factor.get('factor') for factor in conditionset_data.get('factors')
        ]

        header_str = self._build_header_str(factor_names)

        table_str = ''

        conditions = conditionset_data.get('conditions')

        for feature_id, factor_id in row_mapping.items():
            if feature_id in row_ids:
                feature_conditions = conditions.get(factor_id)

                table_str += '<tr>'
                table_str += '<td>{}</td>'.format(feature_id)

                for feature_condition in feature_conditions:
                    table_str += '<td>{}</td>'.format(feature_condition)
                table_str += '</tr>'

        return header_str, table_str

    def _generate_search_html_report(self, header_str, table_str):

        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'search.html')

        shutil.copy2(os.path.join(os.path.dirname(__file__), 'kbase_icon.png'),
                     output_directory)
        shutil.copy2(
            os.path.join(os.path.dirname(__file__), 'search_icon.png'),
            output_directory)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'search_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '//HEADER_STR', header_str)
                report_template = report_template.replace(
                    '//TABLE_STR', table_str)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Search Matrix App'
        })

        return html_report

    def _generate_search_report(self, header_str, table_str, workspace_name):
        log('Start creating report')

        output_html_files = self._generate_search_html_report(
            header_str, table_str)

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name':
            'kb_matrix_filter_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _filter_value_data(self, value_data, feature_ids):

        filtered_value_data = dict()
        filtered_value_data['col_ids'] = value_data['col_ids']

        feature_ids = feature_ids.split(',')

        filtered_value_data['row_ids'] = feature_ids
        filtered_value_data['values'] = list()

        values = value_data['values']
        row_ids = value_data['row_ids']
        for feature_id in feature_ids:
            idx = row_ids.index(feature_id)
            filtered_value_data['values'].append(values[idx])

        return filtered_value_data
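
    # Hedged example of the filtering above (all ids are made up):
    #
    #   value_data = {'row_ids': ['g1', 'g2', 'g3'],
    #                 'col_ids': ['c1', 'c2'],
    #                 'values': [[1, 2], [3, 4], [5, 6]]}
    #   self._filter_value_data(value_data, 'g3,g1')
    #   # -> {'col_ids': ['c1', 'c2'], 'row_ids': ['g3', 'g1'],
    #   #     'values': [[5, 6], [1, 2]]}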

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.cu = ConditionUtils(self.callback_url, service_ver="dev")

    def filter_matrix(self, params):
        """
        filter_matrix: create sub-matrix based on input feature_ids or group by factor name

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        feature_ids: comma-separated string of feature ids that the result matrix will contain
        filtered_matrix_name: name of newly created filtered matrix object
        """

        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')
        feature_ids = params.get('feature_ids')
        filtered_matrix_name = params.get('filtered_matrix_name')

        matrix_source = self.dfu.get_objects({"object_refs":
                                              [matrix_obj_ref]})['data'][0]
        matrix_info = matrix_source.get('info')
        matrix_data = matrix_source.get('data')

        matrix_type = self._find_between(matrix_info[2], r'\.', r'\-')

        value_data = matrix_data.get('data')
        filtered_value_data = self._filter_value_data(value_data, feature_ids)
        matrix_data['data'] = filtered_value_data

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        filtered_matrix_obj_ref = self.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(matrix_type),
            'obj_name':
            filtered_matrix_name,
            'data':
            matrix_data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_refs': [filtered_matrix_obj_ref]}

        report_output = self._generate_report(filtered_matrix_obj_ref,
                                              workspace_name)

        returnVal.update(report_output)

        return returnVal
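
    # A minimal usage sketch, assuming `util` is an instance of this class built
    # from a valid config (the ref and names below are hypothetical):
    #
    #   ret = util.filter_matrix({
    #       'matrix_obj_ref': '1234/5/6',
    #       'workspace_name': 'my_workspace',
    #       'feature_ids': 'gene_1,gene_2',
    #       'filtered_matrix_name': 'my_filtered_matrix',
    #   })
    #   # ret -> {'matrix_obj_refs': [...], 'report_name': ..., 'report_ref': ...}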

    def search_matrix(self, params):
        """
        search_matrix: generate an HTML report that allows users to select feature ids

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        """

        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')

        matrix_source = self.dfu.get_objects({"object_refs":
                                              [matrix_obj_ref]})['data'][0]
        matrix_data = matrix_source.get('data')

        row_mapping = matrix_data.get('row_mapping')
        row_conditionset_ref = matrix_data.get('row_conditionset_ref')

        row_ids = matrix_data['data']['row_ids']

        if not (row_mapping and row_conditionset_ref):
            raise ValueError(
                'Matrix object is missing either row_mapping or row_conditionset_ref'
            )

        conditionset_data = self.dfu.get_objects(
            {"object_refs": [row_conditionset_ref]})['data'][0]['data']

        header_str, table_str = self._build_html_str(row_mapping,
                                                     conditionset_data,
                                                     row_ids)

        returnVal = self._generate_search_report(header_str, table_str,
                                                 workspace_name)

        return returnVal

    def import_matrix_from_excel(self, params):
        """
        import_matrix_from_excel: import matrix object from excel

        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: workspace name matrix object to be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_conditionset_ref: column ConditionSet reference
        row_conditionset_ref: row ConditionSet reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """

        (obj_type, file_path, workspace_name, matrix_name,
         refs) = self._validate_import_matrix_from_excel_params(params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data = self._file_to_data(file_path, refs, matrix_name, workspace_id)

        matrix_obj_ref = self.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(obj_type),
            'obj_name':
            matrix_name,
            'data':
            data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_ref': matrix_obj_ref}

        report_output = self._generate_report(matrix_obj_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal
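
    # A minimal usage sketch (the path and names are hypothetical); one of
    # input_shock_id, input_file_path or input_staging_file_path is expected:
    #
    #   ret = util.import_matrix_from_excel({
    #       'obj_type': 'ExpressionMatrix',
    #       'matrix_name': 'my_matrix',
    #       'workspace_name': 'my_workspace',
    #       'input_file_path': '/path/to/matrix.xlsx',
    #   })
    #   # ret -> {'matrix_obj_ref': ..., 'report_name': ..., 'report_ref': ...}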

    def save_object(self, params):
        """
        save_object: validate data constraints and save matrix object

        arguments:
        obj_type: saving object data type
        obj_name: saving object name
        data: data to be saved
        workspace_name: workspace name matrix object to be saved to

        return:
        obj_ref: object reference
        """
        log('Starting saving object')

        obj_type = params.get('obj_type')

        module_name = obj_type.split('.')[0]
        type_name = obj_type.split('.')[1]

        types = self.wsClient.get_module_info({
            'mod': module_name
        }).get('types')

        for module_type in types:
            if self._find_between(module_type, r'\.', r'\-') == type_name:
                obj_type = module_type
                break

        data = dict((k, v) for k, v in params.get('data').iteritems() if v)
        validate = self.validate_data({'obj_type': obj_type, 'data': data})

        if not validate.get('validated'):
            log('Data failed type checking')
            failed_constraints = validate.get('failed_constraints')
            error_msg = 'Object {} failed type checking:\n'.format(
                params.get('obj_name'))
            if failed_constraints.get('unique'):
                unique_values = failed_constraints.get('unique')
                error_msg += 'Object should have unique field: {}\n'.format(
                    unique_values)
            if failed_constraints.get('contains'):
                contained_values = failed_constraints.get('contains')
                for contained_value in contained_values:
                    subset_value = contained_value.split(' ')[0]
                    super_value = ' '.join(contained_value.split(' ')[1:])
                    error_msg += 'Object field [{}] should contain field [{}]\n'.format(
                        super_value, subset_value)
            raise ValueError(error_msg)

        workspace_name = params.get('workspace_name')
        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        info = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": data,
                "name": params.get('obj_name')
            }]
        })[0]

        return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])}

    def validate_data(self, params):
        """
        validate_data: validate data

        arguments:
        obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1'
        data: obj data to be validated

        return:
        validated: True or False
        """

        constraints = self._find_constraints(params.get('obj_type'))
        data = params.get('data')

        constraints = self._filter_constraints(constraints, data)

        validated, failed_constraints = self._validate(constraints, data)

        returnVal = {
            'validated': validated,
            'failed_constraints': failed_constraints
        }

        return returnVal

    def generate_matrix_html(self, params):
        """
        generate_matrix_html: generate an HTML page for given data

        arguments:
        df: a pandas dataframe

        return:
        html_string: html as a string format
        """

        column_str, data_str, formatter_str = self._generate_html_string(
            params.get('df'))

        with open(
                os.path.join(os.path.dirname(__file__),
                             'matrix_page_template.html'),
                'r') as matrix_page_template_file:
            html_string = matrix_page_template_file.read()
            html_string = html_string.replace('// ADD_COL', column_str)
            html_string = html_string.replace('// ADD_DATA', data_str)
            html_string = html_string.replace('// ADD_FORMATTER',
                                              formatter_str)

        returnVal = {'html_string': html_string}

        return returnVal

    def fetch_data(self, params):
        """
        fetch_data: fetch generics data as pandas dataframe for a generics data object

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: the generics data module to be retrieved from
                        e.g. for a given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        generics_module should be
                        {'data': 'FloatMatrix2D',
                         'condition_set_ref': 'condition_set_ref'}

        return:
        data_matrix: a pandas dataframe in json format
        """

        log('--->\nrunning GenericsUtil.fetch_data\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_fetch_data_params(params)

        try:
            data_matrix = self._retrieve_data(params.get('obj_ref'),
                                              params.get('generics_module'))
        except Exception:
            error_msg = 'Running fetch_data returned an error:\n{}\n'.format(
                traceback.format_exc())
            error_msg += 'Please try to specify generics type and name as generics_module\n'
            raise ValueError(error_msg)

        returnVal = {'data_matrix': data_matrix}

        return returnVal
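
    # A minimal usage sketch (the ref is hypothetical); 'generics_module' is only
    # needed when the matrix type cannot be resolved automatically:
    #
    #   ret = util.fetch_data({
    #       'obj_ref': '1234/5/6',
    #       'generics_module': {'data': 'FloatMatrix2D'},
    #   })
    #   df = pd.read_json(ret['data_matrix'])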

    def export_matrix(self, params):
        """
        export_matrix: universal downloader for matrix data objects

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: select the generics data to be retrieved from
                        e.g. for a given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        and only data is needed
                        generics_module should be
                        {'data': 'FloatMatrix2D'}
        """
        log('Start exporting matrix')

        if 'input_ref' in params:
            params['obj_ref'] = params.pop('input_ref')

        obj_source = self.dfu.get_objects(
            {"object_refs": [params.get('obj_ref')]})['data'][0]
        obj_data = obj_source.get('data')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        file_path = os.path.join(result_directory,
                                 '{}.xlsx'.format(obj_source.get('info')[1]))

        data_matrix = self.fetch_data(params).get('data_matrix')
        df = pd.read_json(data_matrix)

        df.to_excel(file_path, sheet_name='data')

        if obj_data.get('col_mapping'):
            self._write_mapping_sheet(file_path, 'col_mapping',
                                      obj_data.get('col_mapping'),
                                      ['col_name', 'condition_name'])
            obj_data.pop('col_mapping')

        if obj_data.get('row_mapping'):
            self._write_mapping_sheet(file_path, 'row_mapping',
                                      obj_data.get('row_mapping'),
                                      ['row_name', 'condition_name'])
            obj_data.pop('row_mapping')

        try:
            obj_data.pop('data')
        except KeyError:
            log('Missing key [data]')

        self._write_mapping_sheet(file_path, 'metadata', obj_data,
                                  ['name', 'value'])

        shock_id = self._upload_to_shock(file_path)

        return {'shock_id': shock_id}
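
    # A minimal usage sketch (the ref is hypothetical); the returned shock id
    # points at an .xlsx file containing the data, mapping and metadata sheets:
    #
    #   ret = util.export_matrix({'input_ref': '1234/5/6'})
    #   # ret -> {'shock_id': '...'}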
Exemplo n.º 10
    def find_motifs(self, ctx, params):
        """
        :param params: instance of type "get_promoter_for_gene_input" (Genome
           is a KBase genome Featureset is a KBase featureset Promoter_length
           is the length of promoter requested for all genes) -> structure:
           parameter "workspace_name" of String, parameter "genome_ref" of
           String, parameter "featureSet_ref" of String, parameter
           "promoter_length" of Long
        :returns: instance of type "get_promoter_for_gene_output_params" ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN find_motifs

        #TODO: have these guys return output paths
        for key, value in params.iteritems():
            print key
        if 'motif_min_length' not in params:
            params['motif_min_length'] = 8
        if 'motif_max_length' not in params:
            params['motif_max_length'] = 16
        motMin = params['motif_min_length']
        motMax = params['motif_max_length']
        promoterFastaFilePath = self.get_promoter_for_gene(ctx, params)[0]

        gibbsCommandList = []
        for i in range(motMin, motMax + 1, 2):
            gibbsCommandList.append(
                GU.build_gibbs_command(promoterFastaFilePath, i))

        for g in gibbsCommandList:
            GU.run_gibbs_command(g)
        #gibbsCommand = GU.build_gibbs_command(promoterFastaFilePath)
        #GU.run_gibbs_command(gibbsCommand)
        #print(promoterFastaFilePath)
        homerMotifCommand = HU.build_homer_motif_command(promoterFastaFilePath)
        homerLocationCommand = HU.build_homer_location_command(
            promoterFastaFilePath)
        os.mkdir(self.shared_folder + '/homer_out')
        #print(homerMotifCommand)
        HU.run_homer_command(homerMotifCommand)
        HU.run_homer_command(homerLocationCommand)

        MEMEMotifCommand = MEU.build_meme_command(promoterFastaFilePath)
        MEU.run_meme_command(MEMEMotifCommand)

        gibbsMotifList = GU.parse_gibbs_output(motMin, motMax)
        homerMotifList = HU.parse_homer_output()
        memeMotifList = MEU.parse_meme_output()

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        timestamp = str(timestamp)
        htmlDir = self.shared_folder + '/html' + timestamp
        os.mkdir(htmlDir)
        lineCount = 0
        with open(promoterFastaFilePath, 'r') as pFile:
            for line in pFile:
                lineCount += 1
        numFeat = lineCount / 2
        with open(promoterFastaFilePath, 'r') as pFile:
            fileStr = pFile.read()
        promHtmlStr = '<html><body> ' + fileStr + ' </body></html>'
        with open(htmlDir + '/promoters.html', 'w') as promHTML:
            promHTML.write(promHtmlStr)
        JsonPath = '/kb/module/work/tmp'
        subprocess.call([
            'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py',
            JsonPath + '/gibbs.json', htmlDir + '/gibbs.html',
            str(numFeat)
        ])
        subprocess.call([
            'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py',
            JsonPath + '/homer_out/homer.json', htmlDir + '/homer.html',
            str(numFeat)
        ])
        subprocess.call([
            'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py',
            JsonPath + '/meme_out/meme.json', htmlDir + '/meme.html',
            str(numFeat)
        ])
        fullMotifList = []
        for h in homerMotifList:
            add = True
            for g in gibbsMotifList:
                if h['Iupac_signature'] == g['Iupac_signature']:
                    add = False
                    break
            for m in memeMotifList:
                if m['Iupac_signature'] == h['Iupac_signature']:
                    add = False
                    break
            if add:
                fullMotifList.append(h)
        for g in gibbsMotifList:
            add = True
            for m in memeMotifList:
                if m['Iupac_signature'] == g['Iupac_signature']:
                    add = False
                    break
            if add:
                fullMotifList.append(g)
        for m in memeMotifList:
            fullMotifList.append(m)

        #What needs to happen here:
        #call makeLogo for each of the json outputs(capture these from somewhere)
        dfu = DataFileUtil(self.callback_url)
        parsed = ['gibbs.html', 'homer.html', 'meme.html', 'promoters.html']
        indexHtmlStr = '<html>'
        #use js to load the page content
        for p in parsed:
            indexHtmlStr += '<head><script src="https://ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script> <script> $(function(){$("#' + p.replace(
                '.html', '_content') + '").load("' + p + '"); });</script> '
        indexHtmlStr += """<style>
            body {font-family: Arial;}

            /* Style the tab */
            .tab {
            overflow: hidden;
    border: 1px solid #ccc;
    background-color: #f1f1f1;
}

/* Style the buttons inside the tab */
.tab button {
    background-color: inherit;
    float: left;
    border: none;
    outline: none;
    cursor: pointer;
    padding: 14px 16px;
    transition: 0.3s;
    font-size: 17px;
}

/* Change background color of buttons on hover */
.tab button:hover {
    background-color: #ddd;
}

/* Create an active/current tablink class */
.tab button.active {
    background-color: #ccc;
}

/* Style the tab content */
.tabcontent {
    display: none;
    padding: 6px 12px;
    border: 1px solid #ccc;
    border-top: none;
}
</style></head> """
        indexHtmlStr += '<body>'
        #adding tabs
        indexHtmlStr += '<div class="tab">\n'
        for p in parsed:
            indexHtmlStr += '<button class="tablinks" onclick="openReport(event, \'' + p.replace(
                '.html', '_content') + '\')">' + p.replace('.html',
                                                           '') + '</button>'
        indexHtmlStr += '</div>'
        for p in parsed:
            indexHtmlStr += '<div id="' + p.replace(
                '.html', '_content') + '" class="tabcontent"></div>'
        indexHtmlStr += """<script>
function openReport(evt, reportName) {
    var i, tabcontent, tablinks;
    tabcontent = document.getElementsByClassName("tabcontent");
    for (i = 0; i < tabcontent.length; i++) {
        tabcontent[i].style.display = "none";
    }
    tablinks = document.getElementsByClassName("tablinks");
    for (i = 0; i < tablinks.length; i++) {
        tablinks[i].className = tablinks[i].className.replace(" active", "");
    }
    document.getElementById(reportName).style.display = "block";
    evt.currentTarget.className += " active";
}
</script>"""

        #for p in parsed:
        #    indexHtmlStr += '<a href="' + p + '">' + p.replace('.html','') +' Output</a>\n'
        #indexHtmlStr += '</body></html>'
        with open(htmlDir + '/index.html', 'w') as html_handle:
            html_handle.write(str(indexHtmlStr))

        #plt.rcParams['figure.dpi'] = 300

        #htmlFiles = ['index.html','gibbs.html','homer.html']
        #shockParamsList = []
        #for f in htmlFiles:
        #    shockParamsList.append({'file_path': htmlDir + f ,'make_handle': 0, 'pack': 'zip'})

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': htmlDir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except:
            raise ValueError('error uploading HTML file to shock')

        #Create motif set object from MotifList
        #TODO set parameters correctly
        #add narrative support to set
        MSO = {}
        MSO['Condition'] = 'Temp'
        MSO['FeatureSet_ref'] = '123'
        MSO['Motifs'] = []
        MSO['Alphabet'] = ['A', 'C', 'G', 'T']
        MSO['Background'] = {}
        for letter in MSO['Alphabet']:
            MSO['Background'][letter] = 0.0

        MSU.parseMotifList(fullMotifList, MSO)
        objname = 'MotifSet' + str(
            int((datetime.utcnow() -
                 datetime.utcfromtimestamp(0)).total_seconds() * 1000))

        #Pass motif set into this
        save_objects_params = {}
        #save_objects_params['id'] = self.ws_info[0]
        #save_objects_params['id'] = long(params['workspace_name'].split('_')[1])
        save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        save_objects_params['objects'] = [{
            'type': 'KBaseGwasData.MotifSet',
            'data': MSO,
            'name': objname
        }]

        info = dfu.save_objects(save_objects_params)[0]
        motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        #object_upload_ret = dfu.file_to_shock()

        reportName = 'identify_promoter_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [{
                'ref':
                motif_set_ref,
                'description':
                'Motif Set generated by identify promoter'
            }],
            'message':
            '',
            'direct_html':
            None,
            'direct_html_index':
            0,
            'file_links': [],
            'html_links': [],
            'html_window_height':
            220,
            'workspace_name':
            params['workspace_name'],
            'report_object_name':
            reportName
        }

        # attach to report obj
        #reportObj['direct_html'] = None
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            #'name': 'promoter_download.zip',
            'name': 'index.html',
            'label': 'Save promoter_download.zip'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']})
        report_info = report.create_extended_report(reportObj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END find_motifs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method find_motifs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Exemplo n.º 11
class variation_importer_utils:
    def __init__(self, utility_params):
        self.params = utility_params
        # self.scratch = utility_params['scratch']
        self.scratch = os.path.join(utility_params['scratch'],
                                    'variation_importer_' + str(uuid.uuid4()))
        os.mkdir(self.scratch)
        self.service_wiz_url = utility_params['srv-wiz-url']
        self.callback_url = utility_params['callback_url']

        self.dfu = DataFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url,
                               token=utility_params['token'])

    def _create_fake_location_data(self):
        location = {
            'lat':
            random.uniform(-90, 90),
            'lon':
            random.uniform(-180, 180),
            'elevation':
            random.uniform(0, 100),
            'description':
            "".join([random.choice(string.ascii_letters) for n in xrange(20)])
        }
        return location

    def _create_fake_straininfo(self, genotype_id):
        straininfo = {
            'source_id': genotype_id,
            'location_info': self._create_fake_location_data()
        }
        return straininfo

    def _create_fake_population(self, genotypes):
        population = {'description': 'Faker population data.', 'strains': []}
        for genome in genotypes:
            population['strains'].append(self._create_fake_straininfo(genome))
        return population

    def _create_fake_kinship_matrix(self):
        kinship = {
            'row_ids': ['one', 'two'],
            'col_ids': ['one', 'two'],
            'kinship_coefficients': [[0.1, 0.1], [0.1, 0.1]]
        }
        return kinship

    def _compare(self, s, t):
        return Counter(s) == Counter(t)

    def pretend_download_staging_file(self, vcf_filename, scratch):
        vcf_filepath = os.path.join(scratch, vcf_filename)
        shutil.copy('/kb/module/data/' + vcf_filename, vcf_filepath)
        return {'copy_file_path': vcf_filepath}

    def _generate_population(self,
                             location_filepath,
                             genotypes,
                             population_description="None Provided"):
        locations = pd.read_csv(location_filepath, delimiter='\t')

        # Drop any missing data from id, latitude, or longitude.
        locations.dropna(subset=['id', 'latitude', 'longitude'], inplace=True)

        # Compare the location IDs with the genotype IDs
        if not (self._compare(locations.iloc[:, 0].astype(str).tolist(),
                              genotypes)):
            log("Location IDs do not match Sample IDs in Variation file!")
            raise ValueError(
                "Location IDs do not match Sample IDs in Variation file!")

        col_names = [x.lower() for x in locations.columns.values]
        expected_columns = ['id', 'latitude', 'longitude']
        optional_columns = ['elevation', 'description']

        # Check that the first three columns match the expected columns.
        if not (self._compare(col_names[0:3], expected_columns)):
            raise ValueError("Missing or unexpected column names in {}".format(
                location_filepath))

        # If optional columns are not present, give default value for each.
        for col in optional_columns:
            if col not in col_names:
                if col == 'elevation':
                    locations[col] = 0.0
                else:
                    locations[col] = "None provided."

        population = {'description': population_description, 'strains': []}
        for idx, row in locations.iterrows():
            population['strains'].append({
                'source_id': str(row['id']),
                'location_info': {
                    'lat': row['latitude'],
                    'lon': row['longitude'],
                    'elevation': row['elevation'],
                    'description': row['description']
                }
            })

        return population
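
    # Hedged sketch of the expected location file: a tab-delimited table whose
    # first three columns are id, latitude and longitude, optionally followed by
    # elevation and description (the values below are made up):
    #
    #   id       latitude   longitude   elevation   description
    #   geno_1   38.53      -121.74     16.0        Davis, CA
    #   geno_2   42.36      -71.09      3.0         Cambridge, MA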

    def _validate_vcf(self, vcf_filepath, vcf_version):
        validation_output_dir = os.path.join(self.scratch,
                                             'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-o")
            validator_cmd.append(validation_output_dir)
        else:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version below 4.1.  No validation logging.")

        print("Validator command: {}".format(validator_cmd))
        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            validator_output.append(line)

        p.wait()

        validation_output_files = [
            f for f in os.listdir(validation_output_dir) if f.endswith('.txt')
        ]

        if not validation_output_files:
            print('Validator did not generate log file!')
            raise Exception("Validator did not generate a log file.")

        validation_output_filepath = os.path.join(validation_output_dir,
                                                  validation_output_files[0])

        log("Validator output filepath: {}".format(validation_output_filepath))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filepath, p.returncode
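
    # Hedged note: VCF files of version 4.1 or later are checked with the
    # vcf_validator_linux binary, which writes a .txt log into
    # validation_output_dir; older files fall back to vcftools' vcf-validator.
    # The caller receives the path to the validator log and the return code.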

    # Retrieve contigs from assembly file.
    def _get_contigs_from_assembly(self, assembly_ref, type='Assembly'):
        try:
            assembly_data = self.dfu.get_objects(
                {'object_refs': [assembly_ref]})['data'][0]['data']
        except Exception as e:
            print("Unable to retrieve Assembly reference: {}".format(
                assembly_ref))
            raise ValueError(e)
        raw_contigs = assembly_data['contigs']
        contigs = {}

        # Contigs returns just a dict with key and contig_id
        for key, value in raw_contigs.iteritems():
            contigs[str(key)] = value['contig_id']
        return raw_contigs

    def _get_version_contigs_genotypes(self, vcf_filepath):
        contigs = []
        genotypes = []
        version = ''
        with (gzip.open if vcf_filepath.endswith('.gz') else open)(
                vcf_filepath, 'rt') as vcf:
            line = vcf.readline()
            tokens = line.split('=')

            if not (tokens[0].startswith('##fileformat')):
                log("Invalid VCF.  ##fileformat line in meta is improperly formatted."
                    )
                raise ValueError(
                    "Invalid VCF.  ##fileformat line in meta is improperly formatted."
                )
            version = float(tokens[1][-4:].rstrip())
            log("VCF version: {}".format(version))
            for line in vcf:
                if line.startswith("#CHROM"):
                    log("#CHROM encountered, exiting loop.")
                    genotypes = line.split()[9:]
                    log("Number Genotypes in vcf: {}".format(len(genotypes)))
                    break
                tokens = line.split("=")

                if tokens[0].startswith('##contig'):
                    contigs.append(tokens[2][:-2])
        return version, contigs, genotypes
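
    # Hedged example of the VCF header lines this parser relies on (the values
    # are illustrative):
    #
    #   ##fileformat=VCFv4.2    -> version = 4.2
    #   ##contig=<ID=Chr1>      -> 'Chr1' appended to contigs
    #   #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT geno_1 geno_2
    #                           -> genotypes = ['geno_1', 'geno_2']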

    # Arabidopsis ref: 18590/2/8
    def _get_assembly_ref_from_genome(self, genome_ref):
        ga = GenomeAnnotationAPI(self.service_wiz_url)
        inputs_get_assembly = {'ref': genome_ref}
        try:
            assembly_object_ref = ga.get_assembly(inputs_get_assembly)
        except Exception as e:
            print(
                "Unable to retrieve Assembly reference ID from Genome ref_id: {}"
                .format(genome_ref))
            raise Exception(e)

        return assembly_object_ref

    def _generate_output_file_list(self):
        log('Start packing result files')
        output_files = list()

        result_file = os.path.join(self.scratch,
                                   'variation_importer_results.zip')
        excluded_extensions = ['.zip', '.vcf', '.vcf.gz', '.html', '.DS_Store']
        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(self.scratch):
                for file in files:
                    if not file.endswith(tuple(excluded_extensions)):
                        zip_file.write(os.path.join(root, file), file)

        output_files.append({
            'path':
            result_file,
            'name':
            os.path.basename(result_file),
            'label':
            os.path.basename(result_file),
            'description':
            'File(s) generated by Variation Importer'
        })
        log("Importer output generated: {}".format(output_files))

        return output_files

    def _generate_report(self, params, variation_results, variation_file_path):

        stats_results = self._generate_variation_stats(
            params['additional_output_type'], variation_file_path)

        html_report = self._generate_html_report(variation_results,
                                                 stats_results)

        file_links = self._generate_output_file_list()
        objects = []
        if (variation_results['valid_variation_file']):
            objects = [{
                'ref':
                variation_results['variation_obj_ref'],
                'description':
                'Variation Object created by VCF Importer'
            }]

        report_params = {
            'objects_created': objects,
            'message': '',
            'direct_html_link_index': 0,
            'file_links': file_links,
            'html_links': html_report,
            'html_window_height': 330,
            'workspace_name': params['workspace_name'],
            'report_object_name':
            'variation_importer_report_' + str(uuid.uuid4())
        }
        kbr_output = self.kbr.create_extended_report(report_params)
        report_output = {
            'report_name': kbr_output['name'],
            'report_ref': kbr_output['ref'],
            'variation_ref': variation_results['variation_obj_ref']
        }
        log("Returning from _generate_report!")
        return report_output

    def _generate_html_report(self, variation_results, stats_output=None):
        """
            _generate_html_report: generate html report from output files
        """
        html_report = list()
        print("Validation output filepath passed to html report: {}".format(
            variation_results['validation_output_filepath']))
        try:
            report_dir = os.path.join(self.scratch, 'html')
            os.mkdir(report_dir)

            with open(template_dir, 'r') as html, open(
                    variation_results['validation_output_filepath'],
                    'r') as validation:

                validation_content = '<p><h4>{} '.format(
                    variation_results['variation_filename'])
                if variation_results.get('valid_variation_file'):
                    validation_content += '<em><i>is</i> a valid </em> variation file.'
                else:
                    validation_content += '<em><i>is not</i> a valid </em>variation file. Details below.'
                validation_content += '</h4></p>'

                report = html.read()

                # Discard the first line of the validation file.  It is irrelevant.
                validation.readline()

                validation_content += '<p><h4>Errors and warning generated by VCF validator:</h4></p>'
                validation_content += '<ul>'
                for line in validation.readlines():
                    validation_content += '<li>{}</li>'.format(line)
                validation_content += '</ul>'

                if variation_results.get('invalid_contigs'):
                    validation_content += '<h4>The following contigs were not found in the reference genome ({}).  The possible contigs have been written to the file {}.  Please see the associated links to download.</h4>'.format(
                        variation_results.get('genome_ref'),
                        'valid_contigs.txt')
                    validation_content += '<ul>'
                    for contig in variation_results.get('invalid_contigs'):
                        validation_content += '<li>{}</li>'.format(contig)
                    validation_content += '</ul>'

                # if not variation_results.get('contigs'):
                #     validation_content += '<h4>No contig information was included in the VCF file header!  Please recreate the VCF file with each contig described in the meta description </h4>'
                report = report.replace('Validation_Results',
                                        validation_content)

                if (stats_output.get('stats_file_dir')):
                    summary_results = '<p><h4>Summary Statistics</h4></p>'
                    summary_results += '''
                                        <table>
                                            <tr>
                                                <th>Number of SNPs</th>
                                                <th>Number of Genotypes </th>
                                            </tr>
                                        '''
                    summary_results += '<tr>'
                    summary_results += '<td>{}</td><td>{}</td>'.format(
                        'To be added later',
                        variation_results['num_genotypes'])
                    summary_results += '</tr></table>'
                    report = report.replace('Variation_Statistics',
                                            summary_results)

                # visualization
                image_content = ''
                if (stats_output.get('stats_img_dir')):
                    image_dir = stats_output.get('stats_img_dir')

                    for file in glob.glob(os.path.join(image_dir, '*.png')):
                        shutil.move(file, report_dir)

                    for image in glob.glob(report_dir + "/*.png"):
                        image = image.replace(report_dir + '/', '')
                        caption = image.replace(report_dir + '/',
                                                '').replace('.png', '')
                        image_content += '<p style="text-align:center"><img align="center" src="{}" ' \
                            '></a><a target="_blank"><br>' \
                            '<p align="center">{}</p></p>'.format(image, caption)

                else:
                    image_content += 'No visualizations generated.'

                report = report.replace("Visualization_Results", image_content)
        except Exception as e:
            print("Error generating HTML report.")
            raise

        report_file_path = os.path.join(report_dir, 'index.html')
        with open(report_file_path, 'w') as output:
            output.write(report)
        try:
            html_upload_ret = self.dfu.file_to_shock({
                'file_path': report_file_path,
                'make_handle': 0,
                'pack': 'zip'
            })
            log("Variation HTML report to shock ref: {}".format(
                html_upload_ret))
        except:
            raise ValueError('Error uploading HTML to shock')

        html_report.append({
            'shock_id': html_upload_ret['shock_id'],
            'name': os.path.basename(report_file_path),
            'label': os.path.basename(report_file_path),
            'description': 'HTML report for Variation Importer'
        })

        return html_report

    def _generate_variation_stats(self, additional_output_type,
                                  variation_filepath):
        """
            :param commments go here
        """
        file_output_directory = os.path.join(self.scratch,
                                             'stats_' + str(uuid.uuid4()))
        os.mkdir(file_output_directory)

        image_output_directory = os.path.join(
            self.scratch, 'stats_images_' + str(uuid.uuid4()))
        os.mkdir(image_output_directory)

        # TODO: Validate user supplied params and build PLINK command
        plink_cmd = ["plink"]
        plink_cmd.append('--vcf')
        plink_cmd.append(variation_filepath)

        # plink_cmd.append('--recode12')
        # plink_cmd.append('transpose')
        # plink_cmd.append('--output-missing-genotype')
        # plink_cmd.append("0")
        plink_cmd.append('--freq')
        plink_cmd.append('--hardy')
        # plink_cmd.append('gz')

        plink_cmd.append('--out')
        plink_cmd.append(variation_filepath)

        print("PLINK arguments: {}".format(plink_cmd))

        plink_output = {
            "errors": [],
            "warnings": []
            # "notes" : []
        }
        p = subprocess.Popen(plink_cmd,
                             cwd=file_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        while True:
            line = p.stdout.readline()
            if not line:
                break
            # log(line)
            tokens = line.split(':')
            if (tokens[0] == 'Error'):
                plink_output['errors'].append(line)
                raise ValueError('PLINK 1.9 error: ' + line)
            elif (tokens[0] == 'Warning'):
                plink_output['warnings'].append(line)
                print(line)
            # elif(tokens[0] == 'Note'):
            #     plink_output['notes'].append(line)
            #     print(line)

        p.stdout.close()
        p.wait()
        plink_output_filepath = os.path.join(file_output_directory,
                                             'plink_cli_output.txt')
        with open(plink_output_filepath, 'w') as plink:
            for data in plink_output:
                plink.write("{}: {}\n".format(data, plink_output[data]))

        plink_output_files = [
            f for f in os.listdir(self.scratch)
            if f.startswith(os.path.basename(variation_filepath) + '.')
        ]

        for file in plink_output_files:
            shutil.move(os.path.join(self.scratch, file),
                        file_output_directory)

        if p.returncode != 0:
            log("PLINK encountered an error during runtime.  Please see log file."
                )

        variation_filename = os.path.basename(variation_filepath)
        base_filepath = os.path.join(file_output_directory, variation_filename)
        freq_filepath = base_filepath + '.frq'

        maf_script_filepath = '/kb/module/lib/VariationImporter/Utils/MAF_check.R'
        hwe_script_filepath = '/kb/module/lib/VariationImporter/Utils/HWE.R'
        log("Frequency filepath: {}".format(freq_filepath))
        # TODO: make function to do Rscript calls.
        # generate visualizations and store in directory
        maf_command = ['Rscript']
        maf_command.append('--no-save')
        maf_command.append('--vanilla')
        maf_command.append(maf_script_filepath)
        maf_command.append(freq_filepath)
        maf_command.append("Minor Allele Frequencies.png")
        r = subprocess.Popen(maf_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        r.wait()
        if r.returncode != 0:
            log("Error creating MAF histogram in R")

        hwe_filepath = base_filepath + '.hwe'
        zoom_filepath = hwe_filepath + '.zoom'
        zoom_command = '''awk '{{ if ($9 < 0.00001) print $0 }}' {} > {}'''.format(
            hwe_filepath, zoom_filepath)
        try:
            z = subprocess.Popen(zoom_command,
                                 cwd=file_output_directory,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=True)
            z.wait()

            if z.returncode != 0:
                log("Error creating HWE zoom file.")

        except Exception as e:
            log("Error creating zoom HWE file: {}".format(e))

        hwe_command = ['Rscript']
        hwe_command.append('--no-save')
        hwe_command.append('--vanilla')
        hwe_command.append(hwe_script_filepath)
        hwe_command.append(hwe_filepath)
        hwe_command.append("Hardy-Weinberg Equilibrium.png")
        hwe_command.append(zoom_filepath)
        hwe_command.append("Hardy-Weinberg Equilibrium Zoom.png")
        print("MAF command: {}".format(hwe_command))
        h = subprocess.Popen(hwe_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        h.wait()

        if h.returncode != 0:
            log("Error generating HWE Zoom plot")

        return {
            'stats_file_dir': file_output_directory,
            'stats_img_dir': image_output_directory
        }
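
    # The TODO above suggests factoring the repeated Rscript invocations into a
    # helper. A minimal sketch of such a helper follows; the name _run_rscript
    # and its argument layout are assumptions, not part of the original module.
    def _run_rscript(self, script_filepath, script_args, work_dir):
        """Run an R script with --no-save --vanilla and log a non-zero exit code."""
        rscript_cmd = ['Rscript', '--no-save', '--vanilla', script_filepath]
        rscript_cmd.extend(script_args)
        proc = subprocess.Popen(rscript_cmd,
                                cwd=work_dir,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                shell=False)
        proc.wait()
        if proc.returncode != 0:
            log("Rscript {} exited with code {}".format(
                script_filepath, proc.returncode))
        return proc.returncode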

    def _save_variation_to_ws(self, workspace_name, variation_obj,
                              variation_filepath, kinship_matrix):
        ws_id = self.dfu.ws_name_to_id(workspace_name)
        try:
            vcf_shock_return = self.dfu.file_to_shock({
                'file_path': variation_filepath,
                'make_handle': 1,
                'pack': 'gzip'
            })
        except Exception as e:
            print("Error uploading file to shock!")
            raise ValueError(e)

        variation_obj['variation_file_reference'] = vcf_shock_return.get(
            'shock_id')

        info = self.dfu.save_objects({
            'id':
            ws_id,
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_obj,
                'name': 'TestVariationImporterName'
            }]
        })[0]

        variation_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        log("Variation reference created: {}".format(variation_ref))
        return variation_ref
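
    # The "%s/%s/%s" % (info[6], info[0], info[4]) pattern above recurs across
    # these modules. A hypothetical helper (the name object_info_to_ref is an
    # assumption, not part of the original code) could centralize it:
    @staticmethod
    def object_info_to_ref(object_info):
        """Build a 'wsid/objid/version' reference from a Workspace object info tuple."""
        # object_info[6] = workspace id, [0] = object id, [4] = version
        return "{}/{}/{}".format(object_info[6], object_info[0],
                                 object_info[4])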

    def validate_vcf(self, params):
        """
            :param params: dict containing all input parameters.
        """

        returnVal = {}
        valid_vcf_file = True

        try:
            vcf_filepath = self.pretend_download_staging_file(
                params['staging_file_subdir_path'],
                self.scratch).get('copy_file_path')

        except Exception as e:
            raise Exception("Unable to download {} from staging area.".format(
                params['staging_file_subdir_path']))

        try:
            location_filepath = self.pretend_download_staging_file(
                params['location_file_subdir_path'],
                self.scratch).get('copy_file_path')

        except Exception as e:
            raise Exception("Unable to download {} from staging area.".format(
                params['location_file_subdir_path']))

        # Check file size
        log("{} file size: {}".format(vcf_filepath,
                                      os.path.getsize(vcf_filepath)))
        log('\nValidating {}...'.format(vcf_filepath))

        vcf_version, vcf_contigs, vcf_genotypes = self._get_version_contigs_genotypes(
            vcf_filepath)

        if not vcf_contigs:
            log("No contig data in {} header.".format(vcf_filepath))
            raise ValueError(
                "No contig data in {} header.".format(vcf_filepath))

        if (vcf_version < 4.1):
            log("VCF file is version {}.  Must be at least version 4.1".format(
                vcf_version))
            raise ValueError(
                "VCF file is version {}.  Must be at least version 4.1".format(
                    vcf_version))

        # Generate population object
        population = self._generate_population(location_filepath,
                                               vcf_genotypes)

        # Retrieve Assembly object reference associated with genome.
        try:
            assembly_ref = self._get_assembly_ref_from_genome(
                params['genome_ref'])
        except Exception as e:
            print("Unable to retrieve {}".format(params['genome_ref']))
            raise ValueError(e)

        # Retrieve contig list from Assembly object.
        try:
            assembly_contigs = self._get_contigs_from_assembly(assembly_ref)
        except Exception as e:
            print("Unable to retrieve contigs from Assembly ref: {}".format(
                assembly_ref))
            raise ValueError(e)

        log("Length of assembly contigs: {}".format(len(assembly_contigs)))
        # Compare contig IDs from VCF to those in the Assembly object
        invalid_contigs = []
        for contig in vcf_contigs:
            if contig not in assembly_contigs.keys():
                invalid_contigs.append(contig)

        if invalid_contigs:
            log("Invalid contig IDs found in {}".format(vcf_filepath))
            valid_contig_filepath = os.path.join(self.scratch,
                                                 'valid_contigs.txt')
            log("Writing valid contigs to file: {}".format(
                valid_contig_filepath))
            with open(valid_contig_filepath, 'w') as icf:
                for contig in assembly_contigs:
                    icf.write(contig + '\n')
            valid_vcf_file = False

        validation_output_filepath, returncode = self._validate_vcf(
            vcf_filepath, vcf_version)

        if returncode != 0:
            valid_vcf_file = False

        kinship_matrix = self._create_fake_kinship_matrix()

        variation_obj_ref = ''
        if valid_vcf_file:
            variation_object = {
                "genome": params['genome_ref'],
                "population": population,
                "contigs": vcf_contigs,
                "comment": "Comments go here",
                "assay": "Assay data goes gere.",
                "originator": "PI/Lab info goes here",
                "pubmed_id": "PubMed ID goes here",
                "kinship_info": kinship_matrix
            }

            variation_obj_ref = self._save_variation_to_ws(
                params['workspace_name'], variation_object, vcf_filepath,
                kinship_matrix)

        log("Variation object reference: {}".format(variation_obj_ref))
        variation_report_metadata = {
            'valid_variation_file': valid_vcf_file,
            'variation_obj_ref': variation_obj_ref,
            'variation_filename': os.path.basename(vcf_filepath),
            'validation_output_filepath': validation_output_filepath,
            'vcf_version': vcf_version,
            'num_genotypes': len(vcf_genotypes),
            'num_contigs': len(vcf_contigs),
            'invalid_contigs': invalid_contigs
        }

        returnVal = self._generate_report(params, variation_report_metadata,
                                          vcf_filepath)

        return returnVal
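
    # The method _get_version_contigs_genotypes used above is not shown in this
    # example. A minimal sketch of what such a VCF header parser might look
    # like (the parsing details and return shape are assumptions):
    def _get_version_contigs_genotypes_sketch(self, vcf_filepath):
        """Read the VCF version, contig IDs and genotype (sample) names from the header."""
        vcf_version = 0.0
        vcf_contigs = []
        vcf_genotypes = []
        with open(vcf_filepath) as vcf:
            for line in vcf:
                if line.startswith('##fileformat=VCFv'):
                    vcf_version = float(line.strip().split('VCFv')[1])
                elif line.startswith('##contig=<ID='):
                    # e.g. ##contig=<ID=Chr1,length=30427671>
                    vcf_contigs.append(
                        line.split('ID=')[1].split(',')[0].rstrip('>\n'))
                elif line.startswith('#CHROM'):
                    # sample/genotype names follow the first nine fixed columns
                    vcf_genotypes = line.rstrip('\n').split('\t')[9:]
                    break
        return vcf_version, vcf_contigs, vcf_genotypes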
Exemplo n.º 12
0
class GenomeInterface:
    def _validate_save_one_genome_params(self, params):
        """
        _validate_save_one_genome_params:
                validates params passed to save_one_genome method
        """

        log('start validating save_one_genome params')

        # check for required parameters
        for p in ['workspace', 'name', 'data']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _check_shock_response(self, response, errtxt):
        """
        _check_shock_response: check shock node response (Copied from DataFileUtil)
        """
        log('start checking shock response')

        if not response.ok:
            try:
                err = json.loads(response.content)['error'][0]
            except:
                # this means shock is down or not responding.
                self.log("Couldn't parse response error content from Shock: " +
                         response.content)
                response.raise_for_status()
            raise ValueError(errtxt + str(err))

    def _own_handle(self, genome_data, handle_property):
        """
        _own_handle: check that handle_property points to a shock node owned by the calling user
        """

        log('start checking handle {} ownership'.format(handle_property))

        if handle_property in genome_data:
            handle_id = genome_data[handle_property]
            hs = HandleService(self.handle_url, token=self.token)
            handles = hs.hids_to_handles([handle_id])
            shock_id = handles[0]['id']

            # Copy from DataFileUtil.own_shock_node implementation:
            header = {'Authorization': 'Oauth {}'.format(self.token)}
            res = requests.get(self.shock_url + '/node/' + shock_id +
                               '/acl/?verbosity=full',
                               headers=header,
                               allow_redirects=True)
            self._check_shock_response(
                res, 'Error getting ACLs for Shock node {}: '.format(shock_id))
            owner = res.json()['data']['owner']['username']
            user_id = self.auth_client.get_user(self.token)

            if owner != user_id:
                log('start copying node to owner: {}'.format(user_id))
                dfu_shock = self.dfu.copy_shock_node({
                    'shock_id': shock_id,
                    'make_handle': True
                })
                handle_id = dfu_shock['handle']['hid']
                genome_data[handle_property] = handle_id

    def _check_dna_sequence_in_features(self, genome):
        """
        _check_dna_sequence_in_features: check dna sequence in each feature
        """
        log('start checking dna sequence in each feature')

        if 'features' in genome:
            features_to_work = {}
            for feature in genome['features']:
                if not ('dna_sequence' in feature and feature['dna_sequence']):
                    features_to_work[feature['id']] = feature['location']

            if len(features_to_work) > 0:
                aseq = AssemblySequenceAPI(self.sw_url, token=self.token)
                get_dna_params = {'requested_features': features_to_work}
                if 'assembly_ref' in genome:
                    get_dna_params['assembly_ref'] = genome['assembly_ref']
                elif 'contigset_ref' in genome:
                    get_dna_params['contigset_ref'] = genome['contigset_ref']
                else:
                    # Nothing to do (it may be a test genome without contigs)...
                    return
                dna_sequences = aseq.get_dna_sequences(
                    get_dna_params)['dna_sequences']
                for feature in genome['features']:
                    if feature['id'] in dna_sequences:
                        feature['dna_sequence'] = dna_sequences[feature['id']]
                        feature['dna_sequence_length'] = len(
                            feature['dna_sequence'])

    def __init__(self, config):
        self.ws_url = config.workspaceURL
        self.handle_url = config.handleURL
        self.shock_url = config.shockURL
        self.sw_url = config.srvWizURL
        self.token = config.token
        self.auth_service_url = config.authServiceUrl
        self.callback_url = config.callbackURL

        self.ws = Workspace(self.ws_url, token=self.token)
        self.auth_client = _KBaseAuth(self.auth_service_url)
        self.dfu = DataFileUtil(self.callback_url)

    def save_one_genome(self, params):
        log('start saving genome object')

        self._validate_save_one_genome_params(params)

        workspace = params['workspace']
        name = params['name']
        data = params['data']

        # check all handles point to shock nodes owned by calling user
        self._own_handle(data, 'genbank_handle_ref')
        self._own_handle(data, 'gff_handle_ref')

        self._check_dna_sequence_in_features(data)

        if 'hidden' in params and str(
                params['hidden']).lower() in ('yes', 'true', 't', '1'):
            hidden = 1
        else:
            hidden = 0

        if isinstance(workspace, int) or workspace.isdigit():
            workspace_id = workspace
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace)

        dfu_save_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': 'KBaseGenomes.Genome',
                'data': data,
                'name': name,
                'hidden': hidden
            }]
        }

        dfu_oi = self.dfu.save_objects(dfu_save_params)[0]

        returnVal = {'info': dfu_oi}

        return returnVal
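
    # A hypothetical call to save_one_genome; the workspace name, object name
    # and genome dict below are placeholders, not values from the original code.
    #
    #   gi = GenomeInterface(config)
    #   result = gi.save_one_genome({
    #       'workspace': 'my_workspace',
    #       'name': 'my_genome',
    #       'data': genome_dict,   # a KBaseGenomes.Genome-shaped dict
    #       'hidden': 'false'
    #   })
    #   saved_object_info = result['info']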
Exemplo n.º 13
0
class CufflinksUtils:
    CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/'
    GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/'

    def __init__(self, config):
        """

        :param config:
        :param logger:
        :param directory: Working directory
        :param urls: Service urls
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR
        pass

    def parse_FPKMtracking_calc_TPM(self, filename):
        """
        Generates TPM from FPKM
        :return:
        """
        fpkm_dict = {}
        tpm_dict = {}
        gene_col = 0
        fpkm_col = 9
        sum_fpkm = 0.0
        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                gene_id = larr[gene_col]
                if gene_id != "":
                    fpkm = float(larr[fpkm_col])
                    sum_fpkm = sum_fpkm + fpkm
                    fpkm_dict[gene_id] = math.log(fpkm + 1, 2)
                    tpm_dict[gene_id] = fpkm

        if sum_fpkm == 0.0:
            log("Warning: Unable to calculate TPM values as sum of FPKM values is 0"
                )
        else:
            for g in tpm_dict:
                tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2)

        return fpkm_dict, tpm_dict
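
    # A small worked example of the conversion above, with illustrative values
    # only: given FPKM = {'geneA': 10.0, 'geneB': 30.0}, sum_fpkm is 40.0, so
    # TPM(geneA) = 10/40 * 1e6 = 250000 and TPM(geneB) = 750000; both
    # dictionaries store log2(x + 1), roughly 17.93 and 19.52 on the TPM side.
    #
    #   math.log((10.0 / 40.0) * 1e6 + 1, 2)   # ~17.93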

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_cufflinks_params(self, params):
        """
        _validate_run_cufflinks_params:
                Raises an exception if params are invalid
        """

        log('Start validating run_cufflinks params')

        # check for required parameters
        for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)

            raise ValueError(error_msg)

    def _run_gffread(self, gff_path, gtf_path):
        """
        _run_gffread: run gffread script

        ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility
        """
        log('converting gff to gtf')
        command = self.GFFREAD_TOOLKIT_PATH + '/gffread '
        command += "-E {0} -T -o {1}".format(gff_path, gtf_path)

        self._run_command(command)

    def _create_gtf_annotation_from_genome(self, genome_ref):
        """
         Create reference annotation file from genome
        """
        ref = self.ws.get_object_subset([{
            'ref':
            genome_ref,
            'included': ['contigset_ref', 'assembly_ref']
        }])
        # Initialize so the missing-reference check below fires instead of a NameError
        contig_id = None
        if 'contigset_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['contigset_ref']
        elif 'assembly_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['assembly_ref']
        if contig_id is None:
            raise ValueError(
                "Genome at {0} does not have reference to the assembly object".
                format(genome_ref))
        print(contig_id)
        log("Generating GFF file from Genome")
        try:
            ret = self.au.get_assembly_as_fasta({'ref': contig_id})
            output_file = ret['path']
            mapping_filename = c_mapping.create_sanitized_contig_ids(
                output_file)
            os.remove(output_file)
            # get the GFF
            ret = self.gfu.genome_to_gff({'genome_ref': genome_ref})
            genome_gff_file = ret['file_path']
            c_mapping.replace_gff_contig_ids(genome_gff_file,
                                             mapping_filename,
                                             to_modified=True)
            gtf_ext = ".gtf"

            if not genome_gff_file.endswith(gtf_ext):
                gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf'
                self._run_gffread(genome_gff_file, gtf_path)
            else:
                gtf_path = genome_gff_file

            log("gtf file : " + gtf_path)
        except Exception:
            raise ValueError(
                "Generating GTF file from Genome Annotation object Failed :  {}"
                .format("".join(traceback.format_exc())))
        return gtf_path

    def _get_gtf_file(self, alignment_ref):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch
        alignment_data = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]['data']

        genome_ref = alignment_data.get('genome_id')
        # genome_name = self.ws.get_object_info([{"ref": genome_ref}], includeMetadata=None)[0][1]
        # ws_gtf = genome_name+"_GTF_Annotation"

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_gtf_file_from_genome_ref(self, genome_ref):
        """
        _get_gtf_file_from_genome_ref: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_input_file(self, alignment_ref):
        """
        _get_input_file: get input BAM file from Alignment object
        """

        bam_file_dir = self.rau.download_alignment(
            {'source_ref': alignment_ref})['destination_dir']

        files = os.listdir(bam_file_dir)
        bam_file_list = [
            file for file in files if re.match(r'.*\_sorted\.bam', file)
        ]
        if not bam_file_list:
            bam_file_list = [
                file for file in files if re.match(r'.*(?<!sorted)\.bam', file)
            ]

        if not bam_file_list:
            raise ValueError('Cannot find .bam file from alignment {}'.format(
                alignment_ref))

        bam_file_name = bam_file_list[0]

        bam_file = os.path.join(bam_file_dir, bam_file_name)

        return bam_file

    def _generate_command(self, params):
        """
        _generate_command: generate cufflinks command
        """
        cufflinks_command = '/opt/cufflinks/cufflinks'
        cufflinks_command += (' -q --no-update-check -p ' +
                              str(params.get('num_threads', 1)))
        if 'max_intron_length' in params and params[
                'max_intron_length'] is not None:
            cufflinks_command += (' --max-intron-length ' +
                                  str(params['max_intron_length']))
        if 'min_intron_length' in params and params[
                'min_intron_length'] is not None:
            cufflinks_command += (' --min-intron-length ' +
                                  str(params['min_intron_length']))
        if 'overhang_tolerance' in params and params[
                'overhang_tolerance'] is not None:
            cufflinks_command += (' --overhang-tolerance ' +
                                  str(params['overhang_tolerance']))

        cufflinks_command += " -o {0} -G {1} {2}".format(
            params['result_directory'], params['gtf_file'],
            params['input_file'])

        log('Generated cufflinks command: {}'.format(cufflinks_command))

        return cufflinks_command

    def _process_rnaseq_alignment_object(self, params):
        """
        _process_rnaseq_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object
        """
        log('start processing RNASeqAlignment object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        if '/' not in params['genome_ref']:
            params['genome_ref'] = params['workspace_name'] + '/' + params[
                'genome_ref']

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_rnaseq_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params['gtf_file'],
            params['expression_suffix'])

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{
            "ref": expression_obj_ref
        }],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _process_kbasesets_alignment_object(self, params):
        """
        _process_kbasesets_alignment_object: process a KBaseSets-style alignment input object
        """
        log('start processing KBaseSets object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_kbasesets_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params.get('gtf_file'),
            params.get('expression_suffix'))

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{
            "ref": expression_obj_ref
        }],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_html_report(self, result_directory, obj_ref):
        """
        _generate_html_report: generate html summary report
        """
        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]

        expression_object_type = expression_object.get('info')[2]

        Overview_Content = ''
        if re.match('KBaseRNASeq.RNASeqExpression-\d.\d',
                    expression_object_type):
            Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
        elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d',
                      expression_object_type):
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data'][
                    'sample_expression_ids']:
                expression_name = self.ws.get_object_info(
                    [{
                        "ref": expression_ref
                    }], includeMetadata=None)[0][1]
                Overview_Content += '<p>{}</p>'.format(expression_name)
        elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type):
            pprint(expression_object)
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data']['items']:
                expression_name = self.ws.get_object_info(
                    [{
                        "ref": expression_ref['ref']
                    }], includeMetadata=None)[0][1]
                condition = expression_ref['label']
                Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format(
                    condition, expression_name)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path':
            result_file_path,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Cufflinks App'
        })
        return html_report

    def _save_rnaseq_expression(self, result_directory, alignment_ref,
                                workspace_name, genome_ref, gtf_file,
                                expression_suffix):
        """
        _save_rnaseq_expression: save Expression object to workspace
        """
        log('start saving Expression object')
        alignment_object_name = self.ws.get_object_info(
            [{
                "ref": alignment_ref
            }], includeMetadata=None)[0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref':
            workspace_name + '/' + expression_name,
            'source_dir':
            result_directory,
            'alignment_ref':
            alignment_ref,
            'tool_used':
            self.tool_used,
            'tool_version':
            self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_kbasesets_expression(self, result_directory, alignment_ref,
                                   workspace_name, genome_ref, gtf_file,
                                   expression_suffix):
        """
        _save_kbasesets_expression: save Expression object to workspace using ExpressionUtils
        and SetAPI
        """
        log('start saving Expression object')

        alignment_info = self.ws.get_object_info3(
            {'objects': [{
                "ref": alignment_ref
            }]})
        alignment_object_name = alignment_info['infos'][0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref':
            workspace_name + '/' + expression_name,
            'source_dir':
            result_directory,
            'alignment_ref':
            alignment_ref,
            'tool_used':
            self.tool_used,
            'tool_version':
            self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_rnaseq_expression_set(self, alignment_expression_map,
                                    alignment_set_ref, workspace_name,
                                    expression_set_name):
        """
        _save_rnaseq_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _save_kbasesets_expression_set(self, alignment_expression_map,
                                       alignment_set_ref, workspace_name,
                                       expression_set_name):
        """
        _save_kbasesets_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref
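
    # _save_rnaseq_expression_set and _save_kbasesets_expression_set above are
    # currently identical. If that is intentional, one could simply delegate to
    # the other (a hedged suggestion, not part of the original module):
    #
    #   def _save_kbasesets_expression_set(self, alignment_expression_map,
    #                                      alignment_set_ref, workspace_name,
    #                                      expression_set_name):
    #       return self._save_rnaseq_expression_set(
    #           alignment_expression_map, alignment_set_ref, workspace_name,
    #           expression_set_name)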

    def _generate_report(self,
                         obj_ref,
                         workspace_name,
                         result_directory,
                         exprMatrix_FPKM_ref=None,
                         exprMatrix_TPM_ref=None):
        """
        _generate_report: generate summary report
        """

        log('creating report')

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_html_report(result_directory,
                                                       obj_ref)

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]
        expression_info = expression_object['info']
        expression_data = expression_object['data']

        expression_object_type = expression_info[2]
        if re.match('KBaseRNASeq.RNASeqExpression-\d+.\d+',
                    expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'Expression generated by Cufflinks'
            }]
        elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'Expression generated by Cufflinks'
            }]
        elif re.match('KBaseSets.ExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'ExpressionSet generated by Cufflinks'
            }]
            items = expression_data['items']
            for item in items:
                objects_created.append({
                    'ref':
                    item['ref'],
                    'description':
                    'Expression generated by Cufflinks'
                })
            objects_created.append({
                'ref':
                exprMatrix_FPKM_ref,
                'description':
                'FPKM ExpressionMatrix generated by Cufflinks'
            })
            objects_created.append({
                'ref':
                exprMatrix_TPM_ref,
                'description':
                'TPM ExpressionMatrix generated by Cufflinks'
            })

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _parse_FPKMtracking(self, filename, metric):
        result = {}
        pos1 = 0
        if metric == 'FPKM':
            pos2 = 7
        elif metric == 'TPM':
            pos2 = 8
        else:
            raise ValueError('Unsupported metric: {}'.format(metric))

        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                if larr[pos1] != "":
                    try:
                        result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2)
                    except ValueError:
                        result[larr[pos1]] = math.log(1, 2)

        return result

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cufflinks_result.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.DS_Store')):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by Cufflinks App'
        })

        return output_files

    def _generate_expression_data(self, result_directory, alignment_ref,
                                  gtf_file, workspace_name, expression_suffix):
        """
        _generate_expression_data: generate Expression object with cufflinks output files
        """
        alignment_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]

        # set expression name
        alignment_object_name = alignment_data_object['info'][1]
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_data = {
            'id': expression_name,
            'type': 'RNA-Seq',
            'numerical_interpretation': 'FPKM',
            'processing_comments': 'log2 Normalized',
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        }
        alignment_data = alignment_data_object['data']

        condition = alignment_data.get('condition')
        expression_data.update({'condition': condition})

        genome_id = alignment_data.get('genome_id')
        expression_data.update({'genome_id': genome_id})

        read_sample_id = alignment_data.get('read_sample_id')
        expression_data.update(
            {'mapped_rnaseq_alignment': {
                read_sample_id: alignment_ref
            }})

        exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM(
            os.path.join(result_directory, 'genes.fpkm_tracking'))

        expression_data.update({'expression_levels': exp_dict})

        expression_data.update({'tpm_expression_levels': tpm_exp_dict})

        handle = self.dfu.file_to_shock({
            'file_path': result_directory,
            'pack': 'zip',
            'make_handle': True
        })['handle']
        expression_data.update({'file': handle})

        return expression_data

    def _generate_expression_set_data(self, alignment_expression_map,
                                      alignment_set_ref, expression_set_name):
        """
        _generate_expression_set_data: generate ExpressionSet object with cufflinks output files
        """
        alignment_set_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_set_ref
            }]})['data'][0]

        alignment_set_data = alignment_set_data_object['data']

        expression_set_data = {
            'tool_used': self.tool_used,
            'tool_version': self.tool_version,
            'id': expression_set_name,
            'alignmentSet_id': alignment_set_ref,
            'genome_id': alignment_set_data.get('genome_id'),
            'sampleset_id': alignment_set_data.get('sampleset_id')
        }

        sample_expression_ids = []
        mapped_expression_objects = []
        mapped_expression_ids = []

        for alignment_expression in alignment_expression_map:
            alignment_ref = alignment_expression.get('alignment_ref')
            expression_ref = alignment_expression.get('expression_obj_ref')
            sample_expression_ids.append(expression_ref)
            mapped_expression_ids.append({alignment_ref: expression_ref})
            alignment_name = self.ws.get_object_info(
                [{
                    "ref": alignment_ref
                }], includeMetadata=None)[0][1]
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_ref
                }], includeMetadata=None)[0][1]
            mapped_expression_objects.append({alignment_name: expression_name})

        expression_set_data['sample_expression_ids'] = sample_expression_ids
        expression_set_data[
            'mapped_expression_objects'] = mapped_expression_objects
        expression_set_data['mapped_expression_ids'] = mapped_expression_ids

        return expression_set_data

    def _process_alignment_set_object(self, params, alignment_object_type):
        """
        _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object
                                        and KBaseSets.ReadsAlignmentSet type object
        """
        log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object'
            '\nparams:\n{}'.format(json.dumps(params, indent=1)))

        alignment_set_ref = params.get('alignment_set_ref')

        if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            params['gtf_file'] = self._get_gtf_file(alignment_set_ref)
        else:
            if '/' not in params['genome_ref']:
                params['genome_ref'] = params['workspace_name'] + '/' + params[
                    'genome_ref']

            params['gtf_file'] = self._get_gtf_file_from_genome_ref(
                params['genome_ref'])

        alignment_set = self.set_api.get_reads_alignment_set_v1({
            'ref':
            alignment_set_ref,
            'include_item_info':
            0,
            'include_set_item_ref_paths':
            1
        })
        mul_processor_params = []
        for alignment in alignment_set["data"]["items"]:
            alignment_ref = alignment['ref_path']
            alignment_upload_params = params.copy()
            alignment_upload_params['alignment_ref'] = alignment_ref
            mul_processor_params.append(alignment_upload_params)
            # use the following when you want to run the cmd sequentially
            # self._process_kbasesets_alignment_object(mul_processor_params[0])

        cpus = min(params.get('num_threads'), multiprocessing.cpu_count())
        pool = Pool(ncpus=cpus)
        log('running _process_alignment_object with {} cpus'.format(cpus))
        alignment_expression_map = pool.map(
            self._process_kbasesets_alignment_object, mul_processor_params)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        expression_items = list()
        for proc_alignment_return in alignment_expression_map:
            expression_obj_ref = proc_alignment_return.get(
                'expression_obj_ref')
            alignment_ref = proc_alignment_return.get('alignment_ref')
            alignment_info = self.ws.get_object_info3({
                'objects': [{
                    "ref": alignment_ref
                }],
                'includeMetadata':
                1
            })
            condition = alignment_info['infos'][0][10]['condition']
            expression_items.append({
                "ref": expression_obj_ref,
                "label": condition,
            })
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_obj_ref
                }], includeMetadata=None)[0][1]
            self._run_command('cp -R {} {}'.format(
                proc_alignment_return.get('result_directory'),
                os.path.join(result_directory, expression_name)))

        expression_set = {
            "description": "generated by kb_cufflinks",
            "items": expression_items
        }

        expression_set_info = self.set_api.save_expression_set_v1({
            "workspace":
            params['workspace_name'],
            "output_object_name":
            params['expression_set_name'],
            "data":
            expression_set
        })

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_set_info['set_ref']
        }

        widget_params = {
            "output": params.get('expression_set_name'),
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_output_object_name(self, params, alignment_object_type,
                                     alignment_object_name):
        """
        Generates the output object name based on the input object type and name, and stores
        it in params under 'expression_name' or 'expression_set_name', depending on whether
        the input object is an alignment or an alignment set.

        :param params: module input params
        :param alignment_object_type: input alignment object type
        :param alignment_object_name: input alignment object name
        """
        expression_set_suffix = params['expression_set_suffix']
        expression_suffix = params['expression_suffix']

        if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment$', alignment_object_name):
                params['expression_name'] = re.sub('_[Aa]lignment$',
                                                   expression_suffix,
                                                   alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_name'] = alignment_object_name + expression_suffix
        if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):
                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix
        if re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):

                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix

    def _save_expression_matrix(self, expressionset_ref, workspace_name):
        """
        _save_expression_matrix: save FPKM and TPM ExpressionMatrix
        """

        log('start saving ExpressionMatrix object')

        expression_set_name = self.ws.get_object_info(
            [{
                "ref": expressionset_ref
            }], includeMetadata=None)[0][1]

        output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '',
                                        expression_set_name)

        upload_expression_matrix_params = {
            'expressionset_ref': expressionset_ref,
            'output_obj_name': output_obj_name_prefix,
            'workspace_name': workspace_name
        }

        expression_matrix_refs = self.eu.get_expressionMatrix(
            upload_expression_matrix_params)

        return expression_matrix_refs

    def run_cufflinks_app(self, params):
        log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_cufflinks_params(params)

        alignment_object_ref = params.get('alignment_object_ref')
        alignment_object_info = self.ws.get_object_info3(
            {"objects": [{
                "ref": alignment_object_ref
            }]})['infos'][0]

        alignment_object_type = alignment_object_info[2]
        alignment_object_name = alignment_object_info[1]

        # get output object name
        self._generate_output_object_name(params, alignment_object_type,
                                          alignment_object_name)

        log('--->\nalignment object type: \n' +
            '{}'.format(alignment_object_type))

        if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            params.update({'alignment_ref': alignment_object_ref})
            returnVal = self._process_rnaseq_alignment_object(params)
            report_output = self._generate_report(
                returnVal.get('expression_obj_ref'),
                params.get('workspace_name'),
                returnVal.get('result_directory'))
            returnVal.update(report_output)
        elif re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \
             re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            params.update({'alignment_set_ref': alignment_object_ref})
            returnVal = self._process_alignment_set_object(
                params, alignment_object_type)
            expression_matrix_refs = self._save_expression_matrix(
                returnVal['expression_obj_ref'], params.get('workspace_name'))
            returnVal.update(expression_matrix_refs)

            report_output = self._generate_report(
                returnVal['expression_obj_ref'], params.get('workspace_name'),
                returnVal['result_directory'],
                expression_matrix_refs['exprMatrix_FPKM_ref'],
                expression_matrix_refs['exprMatrix_TPM_ref'])
            returnVal.update(report_output)
        else:
            raise ValueError(
                'Unsupported alignment object type\nObject info:\n{}'.format(
                    alignment_object_info))

        return returnVal
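
    # A hypothetical invocation of run_cufflinks_app; all references, names and
    # suffixes below are placeholders, not values from the original code.
    #
    #   cufflinks_util = CufflinksUtils(config)
    #   result = cufflinks_util.run_cufflinks_app({
    #       'alignment_object_ref': '123/4/5',   # alignment or alignment set
    #       'workspace_name': 'my_workspace',
    #       'genome_ref': '123/6/7',
    #       'num_threads': 2,
    #       'expression_suffix': '_expression',
    #       'expression_set_suffix': '_expression_set'
    #   })
    #   print(result['expression_obj_ref'])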
Exemplo n.º 14
0
class FastaGFFToGenome:
    def __init__(self, config):
        self.cfg = config
        self.dfu = DataFileUtil(self.cfg.callbackURL)

    def import_file(self, params):

        # 1) validate parameters
        self._validate_import_file_params(params)

        # 2) construct the input directory staging area
        input_directory = os.path.join(self.cfg.sharedFolder,
                                       'fast_gff_upload_' + str(uuid.uuid4()))
        os.makedirs(input_directory)
        file_paths = self._stage_input(params, input_directory)

        # 3) extract out the parameters
        params = self._set_parsed_params(params)

        # 4) do the upload
        result = self.upload_genome(
            shock_service_url=self.cfg.shockURL,
            handle_service_url=self.cfg.handleURL,
            workspace_service_url=self.cfg.workspaceURL,
            callback_url=self.cfg.callbackURL,
            input_fasta_file=file_paths["fasta_file"],
            input_gff_file=file_paths["gff_file"],
            workspace_name=params['workspace_name'],
            core_genome_name=params['genome_name'],
            scientific_name=params['scientific_name'],
            taxon_wsname=params['taxon_wsname'],
            taxon_reference=params['taxon_reference'],
            source=params['source'],
            genome_type=params['type'],
            release=params['release'])

        # 5) generate report
        output_data_ref = params['workspace_name'] + "/" + params['genome_name']
        reportObj = {
            'objects_created': [{
                'ref': output_data_ref,
                'description': 'KBase Genome object'
            }],
            'text_message':
            result['report_string']
        }

        reportClient = KBaseReport(os.environ['SDK_CALLBACK_URL'])
        report_info = reportClient.create({
            'report':
            reportObj,
            'workspace_name':
            params['workspace_name']
        })

        # 6) clear the temp directory
        shutil.rmtree(input_directory)

        # 7) return the result
        info = result['genome_info']
        details = {
            'genome_ref':
            str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]),
            'genome_info': info,
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        return details
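
    # A hypothetical call to import_file; every path and name below is a
    # placeholder, not a value taken from the original code.
    #
    #   importer = FastaGFFToGenome(config)
    #   details = importer.import_file({
    #       'workspace_name': 'my_workspace',
    #       'genome_name': 'my_genome',
    #       'fasta_file': {'path': '/staging/my_assembly.fa'},
    #       'gff_file': {'path': '/staging/my_annotation.gff3'},
    #       'scientific_name': 'unknown_taxon',
    #       'taxon_wsname': 'ReferenceTaxons',
    #       'taxon_reference': None,
    #       'source': 'Ensembl',
    #       'type': 'draft isolate',
    #       'release': None
    #   })
    #   print(details['genome_ref'])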

    def upload_genome(self,
                      shock_service_url=None,
                      handle_service_url=None,
                      workspace_service_url=None,
                      callback_url=None,
                      input_gff_file=None,
                      input_fasta_file=None,
                      workspace_name=None,
                      core_genome_name=None,
                      scientific_name="unknown_taxon",
                      taxon_wsname='ReferenceTaxons',
                      taxon_reference=None,
                      source=None,
                      release=None,
                      genome_type=None):

        # retrieve taxon
        taxonomy, taxon_reference = self._retrieve_taxon(
            taxon_reference, taxon_wsname, scientific_name)

        # reading in Fasta file
        assembly = self._retrieve_fasta_file(input_fasta_file,
                                             core_genome_name, scientific_name,
                                             source)

        if taxon_reference is not None:
            assembly["taxon_ref"] = taxon_reference

        # reading in GFF file
        feature_list = self._retrieve_gff_file(input_gff_file)

        # compile links between features
        feature_hierarchy = self._generate_feature_hierarchy(feature_list)

        # retrieve genome feature list
        (genome_features_list, genome_mrnas_list,
         genome_cdss_list) = self._retrieve_genome_feature_list(
             feature_list, feature_hierarchy, assembly)

        # remove sequences before loading
        for contig in assembly["contigs"]:
            del assembly["contigs"][contig]["sequence"]

        aUtil = AssemblyUtil(callback_url)
        assembly_ref = aUtil.save_assembly_from_fasta({
            'file': {
                'path': input_fasta_file,
                'assembly_name': assembly['assembly_id']
            },
            'workspace_name':
            workspace_name,
            'assembly_name':
            assembly['assembly_id']
        })

        # generate genome info
        genome = self._gen_genome_info(core_genome_name, scientific_name,
                                       assembly_ref, genome_features_list,
                                       genome_cdss_list, genome_mrnas_list,
                                       source, assembly, taxon_reference,
                                       taxonomy, input_gff_file)

        workspace_id = self.dfu.ws_name_to_id(workspace_name)
        genome_info = self.dfu.save_objects({
            "id":
            workspace_id,
            "objects": [{
                "name": core_genome_name,
                "type": "KBaseGenomes.Genome",
                "data": genome
            }]
        })[0]
        report_string = ''

        return {'genome_info': genome_info, 'report_string': report_string}

    def _validate_import_file_params(self, params):
        """
        validate_import_file_params:
                    validates params passed to FastaGFFToGenome.import_file method

        """

        # check for required parameters
        for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # exactly one of 'path' or 'shock_id' is required
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            if not isinstance(file, dict):
                raise ValueError(
                    'Required "{}" field must be a map/dict'.format(key))
            n_valid_fields = 0
            if 'path' in file and file['path'] is not None:
                n_valid_fields += 1
            if 'shock_id' in file and file['shock_id'] is not None:
                n_valid_fields += 1
            if 'ftp_url' in file and file['ftp_url'] is not None:
                n_valid_fields += 1
                raise ValueError(
                    'FTP link is currently not supported for FastaGFFToGenome')
            if n_valid_fields < 1:
                error_msg = 'Required "{}" field must include one source: '.format(
                    key)
                error_msg += 'path | shock_id'
                raise ValueError(error_msg)
            if n_valid_fields > 1:
                error_msg = 'Required "{}" field has too many sources specified: '.format(
                    key)
                error_msg += str(file.keys())
                raise ValueError(error_msg)

        # check for valid type param
        valid_types = ['Reference', 'User upload', 'Representative']
        if params.get('type') and params['type'] not in valid_types:
            error_msg = 'Entered value for type is not one of the valid entries of '
            error_msg += '[' + ''.join('"' + str(e) + '", '
                                       for e in valid_types)[0:-2] + ']'
            raise ValueError(error_msg)
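
    # Illustrative example (not from the original code): a params dict that passes
    # the validation above could look like
    #     {'workspace_name': 'my_ws', 'genome_name': 'my_genome',
    #      'fasta_file': {'path': '/path/to/genome.fa'},
    #      'gff_file': {'shock_id': '<shock node id>'}}
    # i.e. each file map carries exactly one of 'path' or 'shock_id'.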

    def _set_parsed_params(self, params):
        log('Setting params')

        # default params
        default_params = {
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'scientific_name': 'unknown_taxon',
            'taxon_reference': None,
            'source': 'User',
            'release': None,
            'type': 'User upload',
            'metadata': {}
        }

        for field in default_params:
            if field not in params:
                params[field] = default_params[field]

        log(json.dumps(params, indent=1))

        return params

    def _stage_input(self, params, input_directory):
        """
        stage_input: Setup the input_directory by fetching the files and uncompressing if needed

        """

        file_paths = dict()
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            file_path = None
            if 'path' in file and file['path'] is not None:
                local_file_path = file['path']
                file_path = os.path.join(input_directory,
                                         os.path.basename(local_file_path))
                log('Moving file from {} to {}'.format(local_file_path,
                                                       file_path))
                shutil.copy2(local_file_path, file_path)

            if 'shock_id' in file and file['shock_id'] is not None:
                # handle shock file
                log('Downloading file from SHOCK node: {}-{}'.format(
                    self.cfg.sharedFolder, file['shock_id']))
                sys.stdout.flush()
                file_name = self.dfu.shock_to_file({
                    'file_path': input_directory,
                    'shock_id': file['shock_id']
                })['node_file_name']
                file_path = os.path.join(input_directory, file_name)

            # extract the file if it is compressed
            if file_path is not None:
                print("staged input file =" + file_path)
                sys.stdout.flush()
                dfUtil_result = self.dfu.unpack_file({'file_path': file_path})
                file_paths[key] = dfUtil_result['file_path']
            else:
                raise ValueError(
                    'No valid files could be extracted based on the input')

        return file_paths

    def _retrieve_taxon(self, taxon_reference, taxon_wsname, scientific_name):
        """
        _retrieve_taxon: retrieve taxonomy and taxon_reference

        """
        taxon_id = -1
        taxon_object_name = "unknown_taxon"

        # retrieve lookup object if scientific name provided
        if (taxon_reference is None
                and scientific_name != "unknown_taxon"):
            # retrieve taxon lookup object then find taxon id
            taxon_lookup = self.dfu.get_objects({
                'object_refs': [taxon_wsname + "/taxon_lookup"],
                'ignore_errors':
                0
            })['data'][0]['data']['taxon_lookup']

            if (scientific_name[0:3] in taxon_lookup
                    and scientific_name in taxon_lookup[scientific_name[0:3]]):
                taxon_id = taxon_lookup[scientific_name[0:3]][scientific_name]
                taxon_object_name = "{}_taxon".format(str(taxon_id))

        # retrieve Taxon object
        taxon_info = {}
        if (taxon_reference is None):
            taxon_info = self.dfu.get_objects({
                'object_refs': [taxon_wsname + "/" + taxon_object_name],
                'ignore_errors':
                0
            })['data'][0]
            taxon_reference = "{}/{}/{}".format(taxon_info['info'][6],
                                                taxon_info['info'][0],
                                                taxon_info['info'][4])
        else:
            taxon_info = self.dfu.get_objects({
                "object_refs": [taxon_reference],
                'ignore_errors': 0
            })['data'][0]

        taxonomy = taxon_info['data']['scientific_lineage']

        return taxonomy, taxon_reference

    def _retrieve_fasta_file(self, input_fasta_file, core_genome_name,
                             scientific_name, source):
        """
        _retrieve_fasta_file: retrieve info from fasta_file
                              https://www.biostars.org/p/710/

        """
        log("Reading FASTA file")

        assembly = {
            "contigs": {},
            "dna_size": 0,
            "gc_content": 0,
            "md5": [],
            "base_counts": {}
        }
        contig_seq_start = 0

        input_file_handle = open(input_fasta_file, 'rb')

        # alternate header and sequence
        faiter = (x[1] for x in itertools.groupby(input_file_handle,
                                                  lambda line: line[0] == ">"))
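        # groupby yields alternating groups of lines: headers (starting with ">")
        # and the sequence lines that follow; the loop consumes a header group,
        # then pulls the matching sequence group with next(faiter).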
        for header in faiter:
            # drop the ">"
            header = next(header)[1:].strip()
            # join all sequence lines to one.
            seq = "".join(s.strip() for s in faiter.next())

            try:
                fasta_header, fasta_description = header.split(' ', 1)
            except ValueError:
                fasta_header = header
                fasta_description = None

            # Handle record
            seq = seq.upper()

            # Build contig objects for Assembly
            seq_count = dict(collections.Counter(seq))

            # to delete at end, but required for now
            contig_dict = {"sequence": seq}

            Ncount = 0
            if "N" in seq_count:
                Ncount = seq_count["N"]
            contig_dict["Ncount"] = Ncount

            for character in seq_count:
                if character in assembly["base_counts"]:
                    assembly["base_counts"][character] += seq_count[character]
                else:
                    assembly["base_counts"][character] = seq_count[character]

            contig_seq_length = len(seq)
            assembly["dna_size"] += contig_seq_length
            contig_gc_length = seq.count("G")
            contig_gc_length += seq.count("C")
            contig_dict["gc_content"] = float("{0:.2f}".format(
                float(contig_gc_length) / float(contig_seq_length)))
            assembly["gc_content"] += contig_gc_length
            contig_dict["contig_id"] = fasta_header
            contig_dict["name"] = fasta_header
            contig_dict["length"] = contig_seq_length
            contig_dict["md5"] = hashlib.md5(seq).hexdigest()
            assembly["md5"].append(contig_dict["md5"])

            if fasta_description is not None:
                contig_dict["description"] = fasta_description

            contig_dict["is_circular"] = "Unknown"
            contig_dict["start_position"] = contig_seq_start
            contig_dict["num_bytes"] = sys.getsizeof(contig_dict["sequence"])
            assembly["contigs"][fasta_header] = contig_dict

            # used for start of next sequence and total gc_content
            contig_seq_start += contig_seq_length

        assembly["gc_content"] = float("{0:.2f}".format(
            float(assembly["gc_content"]) / float(contig_seq_start)))
        assembly["md5"] = hashlib.md5(",".join(assembly["md5"])).hexdigest()
        assembly["assembly_id"] = core_genome_name + "_assembly"
        assembly["name"] = scientific_name
        assembly["external_source"] = source
        assembly["external_source_id"] = os.path.basename(input_fasta_file)
        assembly["external_source_origination_date"] = str(
            os.stat(input_fasta_file).st_ctime)
        assembly["num_contigs"] = len(assembly["contigs"].keys())
        assembly["type"] = "Unknown"
        assembly[
            "notes"] = "Note MD5s are generated from uppercasing the sequences"

        return assembly

    def _retrieve_gff_file(self, input_gff_file):
        """
        _retrieve_gff_file: retrieve info from gff_file
    
        """
        log("Reading GFF file")

        feature_list = dict()
        is_phytozome = 0
        is_patric = 0

        gff_file_handle = open(input_gff_file, 'rb')
        current_line = gff_file_handle.readline()
        line_count = 0

        while (current_line != ''):
            current_line = current_line.strip()

            if (current_line.isspace() or current_line == ""
                    or current_line.startswith("#")):
                pass
            else:
                #Split line
                (contig_id, source_id, feature_type, start, end, score, strand,
                 phase, attributes) = current_line.split('\t')

                #Checking to see if Phytozome
                if ("phytozome" in source_id or "Phytozome" in source_id):
                    is_phytozome = 1

                #Checking to see if PATRIC
                if ("PATRIC" in source_id):
                    is_patric = 1

                #PATRIC prepends their contig ids with some gibberish
                if (is_patric and "|" in contig_id):
                    contig_id = contig_id.split("|", 1)[1]

                #Features grouped by contigs first
                if (contig_id not in feature_list):
                    feature_list[contig_id] = list()

                #Populating basic feature object
                ftr = {
                    'contig': contig_id,
                    'source': source_id,
                    'type': feature_type,
                    'start': int(start),
                    'end': int(end),
                    'score': score,
                    'strand': strand,
                    'phase': phase,
                    'attributes': attributes
                }

                #Populating with attribute key-value pair
                #This is where the feature id is from
                for attribute in attributes.split(";"):
                    attribute = attribute.strip()

                    #Sometimes empty string
                    if (attribute == ""):
                        continue

                    #Split only on the first '=' since the value itself may contain '='
                    #Sometimes lack of "=", assume spaces instead
                    if ("=" in attribute):
                        key, value = attribute.split("=", 1)
                    elif (" " in attribute):
                        key, value = attribute.split(" ", 1)
                    else:
                        log("Warning: attribute " + attribute +
                            " cannot be separated into key,value pair")
                        continue

                    ftr[key] = value

                feature_list[contig_id].append(ftr)

            current_line = gff_file_handle.readline()

        gff_file_handle.close()

        #Some GFF/GTF files don't use "ID" so we go through the possibilities
        feature_list = self._add_missing_identifiers(feature_list)

        #Most bacterial files have only CDSs
        #In order to work with prokaryotic and eukaryotic gene structure synonymously
        #Here we add feature dictionaries representing the parent gene and mRNAs
        feature_list = self._add_missing_parents(feature_list)

        #Phytozome has the annoying habit of editing their identifiers so we fix them
        if (is_phytozome):
            self._update_phytozome_features(feature_list)

        #All identifiers need to be checked so that they follow the same general rules
        #Rules are listed within the function itself
        feature_list = self._update_identifiers(feature_list)

        #If phytozome, the edited files need to be re-printed as GFF so that it works better with RNA-Seq pipeline
        if (is_phytozome):
            self._print_phytozome_gff(input_gff_file, feature_list)

        return feature_list

    def _add_missing_identifiers(self, feature_list):

        #General rule is to iterate through a range of possibilities if "ID" is missing
        for contig in feature_list.keys():
            for i in range(len(feature_list[contig])):
                if ("ID" not in feature_list[contig][i]):
                    for key in ("transcriptId", "proteinId", "PACid", "pacid",
                                "Parent"):
                        if (key in feature_list[contig][i]):
                            feature_list[contig][i]['ID'] = feature_list[
                                contig][i][key]
                            break

                    #If the process fails, log an error
                    for ftr_type in ("gene", "mRNA", "CDS"):
                        if (ftr_type not in feature_list[contig][i]):
                            continue

                        if ("ID" not in feature_list[contig][i]):
                            log("Error: Cannot find unique ID to utilize in GFF attributes: "+ \
                                    feature_list[contig][i]['contig']+"."+ \
                                    feature_list[contig][i]['source']+"."+ \
                                    feature_list[contig][i]['type']+": "+ \
                                    feature_list[contig][i]['attributes'])
        return feature_list

    def _generate_feature_hierarchy(self, feature_list):

        feature_hierarchy = {contig: {} for contig in feature_list}
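        # Filled in below as, per contig, a map from gene id to its child features.
        # Illustrative shape (not from the original code):
        #   {contig: {gene_id: {"utrs": [{"id": ..., "index": ...}, ...],
        #                       "mrnas": [{"id": ..., "index": ..., "cdss": [...]}, ...],
        #                       "cdss": [], "index": gene_index}}}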

        #Need to remember mRNA/gene links for CDSs
        mRNA_gene_dict = {}
        exon_list_position_dict = {}

        for contig in feature_list:
            for i in range(len(feature_list[contig])):
                ftr = feature_list[contig][i]

                if ("gene" in ftr["type"]):
                    feature_hierarchy[contig][ftr["ID"]] = {
                        "utrs": [],
                        "mrnas": [],
                        "cdss": [],
                        "index": i
                    }

                if ("UTR" in ftr["type"]):
                    feature_hierarchy[contig][mRNA_gene_dict[
                        ftr["Parent"]]]["utrs"].append({
                            "id": ftr["ID"],
                            "index": i
                        })

                if ("RNA" in ftr["type"]):
                    feature_hierarchy[contig][ftr["Parent"]]["mrnas"].append({
                        "id":
                        ftr["ID"],
                        "index":
                        i,
                        "cdss": []
                    })
                    mRNA_gene_dict[ftr["ID"]] = ftr["Parent"]
                    exon_list_position_dict[ftr["ID"]] = len(
                        feature_hierarchy[contig][ftr["Parent"]]["mrnas"]) - 1

                if ("CDS" in ftr["type"]):
                    feature_hierarchy[contig][mRNA_gene_dict[ftr["Parent"]]]["mrnas"]\
                        [exon_list_position_dict[ftr["Parent"]]]["cdss"].append( { "id": ftr["ID"], "index" : i } )

        return feature_hierarchy

    def _add_missing_parents(self, feature_list):

        #General rule: if a CDS or RNA is missing a parent, create one
        for contig in feature_list.keys():
            ftrs = feature_list[contig]
            new_ftrs = []
            for i in range(len(ftrs)):
                if ("Parent" not in ftrs[i]):
                    #Assuming parent doesn't exist at all, so create de novo instead of trying to find it
                    if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]):
                        new_gene_ftr = copy.deepcopy(ftrs[i])
                        new_gene_ftr["type"] = "gene"
                        ftrs[i]["Parent"] = new_gene_ftr["ID"]
                        new_ftrs.append(new_gene_ftr)

                    if ("CDS" in ftrs[i]["type"]):
                        new_rna_ftr = copy.deepcopy(ftrs[i])
                        new_rna_ftr["type"] = "mRNA"
                        new_ftrs.append(new_rna_ftr)
                        ftrs[i]["Parent"] = new_rna_ftr["ID"]

                new_ftrs.append(ftrs[i])
            feature_list[contig] = new_ftrs
        return feature_list

    def _update_phytozome_features(self, feature_list):

        #General rule is to use the "Name" field where possible
        #And update parent attribute correspondingly
        for contig in feature_list.keys():
            feature_position_dict = {}
            for i in range(len(feature_list[contig])):

                #Maintain old_id for reference
                #Sometimes ID isn't available, so use PACid
                old_id = None
                for key in ("ID", "PACid", "pacid"):
                    if (key in feature_list[contig][i]):
                        old_id = feature_list[contig][i][key]
                        break
                if (old_id is None):
                    #This should be an error
                    print ("Cannot find unique ID, PACid, or pacid in GFF attributes: ",\
                               feature_list[contig][i][contig],feature_list[contig][i][source],feature_list[contig][i][attributes])
                    continue

                #Retain old_id
                feature_position_dict[old_id] = i

                #In Phytozome, gene and mRNA have "Name" field, CDS do not
                if ("Name" in feature_list[contig][i]):
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        "Name"]

                if ("Parent" in feature_list[contig][i]):
                    #Update Parent to match new ID of parent ftr
                    feature_list[contig][i]["Parent"] = feature_list[contig][
                        feature_position_dict[feature_list[contig][i]
                                              ["Parent"]]]["ID"]

        return feature_list

    def _update_identifiers(self, feature_list):

        #General rules:
        #1) Genes keep identifier
        #2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA"
        #3) CDS always uses RNA identifier with ".CDS" appended
        #4) CDS appended with an incremented digit
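        #Illustrative example (not from the original code): gene "g1" keeps "g1";
        #an mRNA "m1" under "g1" keeps "m1" (it would become "g1.mRNA" if its ID
        #equalled its Parent); the CDS rows under "m1" become "m1.CDS.1", "m1.CDS.2", ...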

        CDS_count_dict = dict()
        mRNA_parent_dict = dict()

        for contig in feature_list.keys():
            for ftr in feature_list[contig]:
                if ("Parent" in ftr):

                    #Retain old_id of parents
                    old_id = ftr["ID"]

                    if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]):
                        ftr["ID"] = ftr["Parent"] + "." + ftr["type"]

                    #link old to new ids for mRNA to use with CDS
                    if ("RNA" in ftr["type"]):
                        mRNA_parent_dict[old_id] = ftr["ID"]

                    if ("CDS" in ftr["type"]):
                        #Increment CDS identifier
                        if (ftr["ID"] not in CDS_count_dict):
                            CDS_count_dict[ftr["ID"]] = 1
                        else:
                            CDS_count_dict[ftr["ID"]] += 1
                        ftr["ID"] = ftr["ID"] + "." + str(
                            CDS_count_dict[ftr["ID"]])

                        #Recall new mRNA id for parent
                        ftr["Parent"] = mRNA_parent_dict[ftr["Parent"]]

        return feature_list

    def _print_phytozome_gff(self, input_gff_file, feature_list):

        #Write modified feature ids to new file
        input_gff_file = input_gff_file.replace("gene", "edited_gene") + ".gz"
        try:
            print("Printing to new file: " + input_gff_file)
            gff_file_handle = gzip.open(input_gff_file, 'wb')
        except IOError:
            print("Failed to open " + input_gff_file)
            raise

        for contig in sorted(feature_list.keys()):
            for ftr in feature_list[contig]:

                #Re-build attributes
                attributes_dict = {}
                for attribute in ftr["attributes"].split(";"):
                    attribute = attribute.strip()

                    #Sometimes empty string
                    if (attribute == ""):
                        continue

                    #Split only on the first '=' since the value itself may contain '='
                    #Sometimes lack of "=", assume spaces instead
                    if ("=" in attribute):
                        key, value = attribute.split("=", 1)
                    elif (" " in attribute):
                        key, value = attribute.split(" ", 1)
                    else:
                        log("Warning: attribute " + attribute +
                            " cannot be separated into key,value pair")
                        continue

                    if (ftr[key] != value):
                        value = ftr[key]
                    attributes_dict[key] = value

                ftr["attributes"] = ";".join(key + "=" + attributes_dict[key]
                                             for key in attributes_dict.keys())

                new_line = "\t".join(
                    str(ftr[key]) for key in [
                        'contig', 'source', 'type', 'start', 'end', 'score',
                        'strand', 'phase', 'attributes'
                    ])
                gff_file_handle.write(new_line + "\n")
        gff_file_handle.close()
        return

    def _retrieve_genome_feature_list(self, feature_list, feature_hierarchy,
                                      assembly):

        genome_features_list = list()
        genome_mrnas_list = list()
        genome_cdss_list = list()
        genome_translation_issues = list()

        for contig in feature_hierarchy:
            for gene in feature_hierarchy[contig]:

                #We only iterate through the gene objects
                #And then for each gene object, retrieve the necessary mRNA and CDS objects indirectly

                ftr = feature_list[contig][feature_hierarchy[contig][gene]
                                           ["index"]]
                contig_sequence = assembly["contigs"][
                    ftr["contig"]]["sequence"]
                gene_ftr = self._convert_ftr_object(
                    ftr, contig_sequence
                )  #reverse-complementation for negative strands done here

                #Add non-optional terms
                gene_ftr["mrnas"] = list()
                gene_ftr["cdss"] = list()
                gene_ftr["ontology_terms"] = dict()

                #Retaining longest sequences for gene feature
                longest_protein_length = 0
                longest_protein_sequence = ""
                for mRNA in feature_hierarchy[contig][gene]["mrnas"]:

                    ########################################################
                    # Construct mRNA Ftr
                    ########################################################
                    ftr = feature_list[contig][mRNA["index"]]
                    contig_sequence = assembly["contigs"][
                        ftr["contig"]]["sequence"]
                    mRNA_ftr = self._convert_ftr_object(
                        ftr, contig_sequence
                    )  #reverse-complementation for negative strands done here

                    #Modify mrna object for use in mrna array
                    #Objects will be un-used until further notice
                    mRNA_ftr['parent_gene'] = gene_ftr['id']

                    #If there are CDSs, use the aggregated CDS id (no increment suffix, since the CDSs are merged)
                    if (len(mRNA['cdss']) > 0):
                        mRNA_ftr['cds'] = mRNA_ftr['id'] + ".CDS"
                    else:
                        mRNA_ftr['cds'] = ""

                    #Add to mrnas array
                    genome_mrnas_list.append(mRNA_ftr)

                    #Add ids to gene_ftr arrays
                    gene_ftr["mrnas"].append(mRNA_ftr["id"])

                    ########################################################
                    # Construct transcript, protein sequence, UTR, CDS locations
                    ########################################################

                    #At time of writing, all of this aggregation should probably be done in a single function
                    cds_exons_locations_array = list()
                    cds_cdna_sequence = str()
                    protein_sequence = str()
                    if (len(mRNA["cdss"]) > 0):
                        (cds_exons_locations_array, cds_cdna_sequence, protein_sequence) = \
                            self._cds_aggregation_translation(mRNA["cdss"],feature_list[contig],assembly,genome_translation_issues)

                    UTRs = list()
                    if ("utrs" in feature_hierarchy[contig][gene] and
                            len(feature_hierarchy[contig][gene]["utrs"]) > 0):
                        for UTR in feature_hierarchy[contig][gene]["utrs"]:
                            ftr = feature_list[contig][UTR["index"]]
                            if ("Parent" in ftr
                                    and ftr["Parent"] == mRNA_ftr["id"]):
                                UTRs.append(ftr)

                    mrna_exons_locations_array = copy.deepcopy(
                        cds_exons_locations_array)
                    mrna_transcript_sequence = str(cds_cdna_sequence)
                    if (len(UTRs) > 0):
                        (mrna_exons_locations_array, mrna_transcript_sequence) = \
                            self._utr_aggregation(UTRs,assembly,mrna_exons_locations_array,cds_cdna_sequence)

                    #Update sequence and locations
                    mRNA_ftr["dna_sequence"] = mrna_transcript_sequence
                    mRNA_ftr["dna_sequence_length"] = len(
                        mrna_transcript_sequence)
                    mRNA_ftr["location"] = mrna_exons_locations_array
                    mRNA_ftr["md5"] = hashlib.md5(
                        mRNA_ftr["dna_sequence"]).hexdigest()

                    #Remove DNA
                    del mRNA_ftr["dna_sequence"]
                    del mRNA_ftr["dna_sequence_length"]

                    #Skip CDS if not present
                    if (len(mRNA["cdss"]) == 0):
                        continue

                    #Remove asterisk representing stop codon if present
                    if (len(protein_sequence) > 0
                            and protein_sequence[-1] == '*'):
                        protein_sequence = protein_sequence[:-1]

                    #Save longest sequence
                    if (len(protein_sequence) > longest_protein_length):
                        longest_protein_length = len(protein_sequence)
                        longest_protein_sequence = protein_sequence

                    ########################################################
                    # Construct CDS Ftr
                    ########################################################
                    CDS_ftr = dict()
                    CDS_ftr['type'] = 'CDS'

                    #Aggregated CDS id (no increment suffix, since the CDSs are merged)
                    CDS_ftr['id'] = mRNA_ftr['id'] + '.CDS'

                    #Add gene/mrna links
                    CDS_ftr['parent_gene'] = gene_ftr['id']
                    CDS_ftr['parent_mrna'] = mRNA_ftr['id']

                    #Update sequence and locations
                    CDS_ftr["dna_sequence"] = cds_cdna_sequence
                    CDS_ftr["dna_sequence_length"] = len(cds_cdna_sequence)
                    CDS_ftr["location"] = cds_exons_locations_array
                    CDS_ftr["md5"] = hashlib.md5(
                        CDS_ftr["dna_sequence"]).hexdigest()

                    #Add protein
                    CDS_ftr["protein_translation"] = str(
                        protein_sequence).upper()
                    CDS_ftr["protein_translation_length"] = len(
                        CDS_ftr["protein_translation"])
                    #Only generate md5 for dna sequences
                    #CDS_ftr["md5"] = hashlib.md5(CDS_ftr["protein_translation"]).hexdigest()

                    #Add empty non-optional fields for populating in future
                    CDS_ftr["ontology_terms"] = dict()
                    if ("aliases" not in CDS_ftr):
                        CDS_ftr["aliases"] = list()
                    if ("function" not in CDS_ftr):
                        CDS_ftr["function"] = ""

                    #Add to cdss array
                    genome_cdss_list.append(CDS_ftr)

                    #Add ids to gene_ftr arrays
                    gene_ftr["cdss"].append(CDS_ftr["id"])

                gene_ftr["protein_translation"] = longest_protein_sequence
                gene_ftr["protein_translation_length"] = longest_protein_length
                genome_features_list.append(gene_ftr)

        msg = "Genome features processed: {} genes, {} RNAs, and {} CDSs\n".format(
            len(genome_features_list), len(genome_mrnas_list),
            len(genome_cdss_list))
        msg += "{} mRNA(s) had errors during translation".format(
            len(genome_translation_issues))
        log(msg)

        return genome_features_list, genome_mrnas_list, genome_cdss_list

    def _gen_genome_info(self, core_genome_name, scientific_name, assembly_ref,
                         genome_features_list, genome_cdss_list,
                         genome_mrnas_list, source, assembly, taxon_reference,
                         taxonomy, input_gff_file):
        """
        _gen_genome_info: generate genome info

        """
        genome = dict()
        genome["id"] = core_genome_name
        genome["scientific_name"] = scientific_name
        genome["assembly_ref"] = assembly_ref
        genome["features"] = genome_features_list
        genome["cdss"] = genome_cdss_list
        genome["mrnas"] = genome_mrnas_list
        genome["source"] = source
        genome["domain"] = "Eukaryota"
        genome["genetic_code"] = 1
        genome["gc_content"] = assembly["gc_content"]
        genome["dna_size"] = assembly["dna_size"]

        if taxon_reference is not None:
            genome["taxon_ref"] = taxon_reference
            genome["taxonomy"] = taxonomy

        gff_file_to_shock = self.dfu.file_to_shock({
            'file_path': input_gff_file,
            'make_handle': 1,
            'pack': "gzip"
        })
        gff_handle_ref = gff_file_to_shock['handle']['hid']

        genome['gff_handle_ref'] = gff_handle_ref

        return genome

    def _convert_ftr_object(self, old_ftr, contig):
        new_ftr = dict()
        new_ftr["id"] = old_ftr["ID"]

        dna_sequence = Seq(contig[old_ftr["start"] - 1:old_ftr["end"]],
                           IUPAC.ambiguous_dna)

        # reverse complement
        if (old_ftr["strand"] == "-"):
            dna_sequence = dna_sequence.reverse_complement()
            old_start = old_ftr["start"]
            old_ftr["start"] = old_ftr["end"]
            old_ftr["end"] = old_start

        new_ftr["dna_sequence"] = str(dna_sequence).upper()
        new_ftr["dna_sequence_length"] = len(dna_sequence)
        new_ftr["md5"] = hashlib.md5(str(dna_sequence)).hexdigest()
        new_ftr["location"] = [[
            old_ftr["contig"], old_ftr["start"], old_ftr["strand"],
            len(dna_sequence)
        ]]
        new_ftr["type"] = old_ftr["type"]

        new_ftr["aliases"] = list()
        for key in ("transcriptId", "proteinId", "PACid", "pacid"):
            if (key in old_ftr.keys()):
                new_ftr["aliases"].append(key + ":" + old_ftr[key])

        return new_ftr

    def _utr_aggregation(self, utr_list, assembly, exons, exon_sequence):

        #create copies of locations and transcript
        utrs_exons = list(exons)
        utr_exon_sequence = exon_sequence

        five_prime_dna_sequence = ""
        three_prime_dna_sequence = ""
        five_prime_locations = list()
        three_prime_locations = list()

        for UTR in (utr_list):
            contig_sequence = assembly["contigs"][UTR["contig"]]["sequence"]
            UTR_ftr = self._convert_ftr_object(
                UTR, contig_sequence
            )  #reverse-complementation for negative strands done here

            #aggregate sequences and locations
            if ("five_prime" in UTR_ftr["id"]):
                five_prime_dna_sequence += UTR_ftr["dna_sequence"]
                five_prime_locations.append(UTR_ftr["location"][0])
            if ("three_prime" in UTR_ftr["id"]):
                three_prime_dna_sequence += UTR_ftr["dna_sequence"]
                three_prime_locations.append(UTR_ftr["location"][0])

        #Handle five_prime UTRs
        if (len(five_prime_locations) > 0):

            #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file)
            five_prime_locations = sorted(five_prime_locations,
                                          key=lambda x: x[1])

            #Merge last UTR with CDS if "next" to each other
            if( ( utrs_exons[0][2] == "+" and five_prime_locations[-1][1]+five_prime_locations[-1][3] == utrs_exons[0][1] ) or \
                ( utrs_exons[0][2] == "-" and five_prime_locations[-1][1]-five_prime_locations[-1][3] == utrs_exons[0][1] ) ):

                #Remove last UTR
                last_five_prime_location = five_prime_locations[-1]
                five_prime_locations = five_prime_locations[:-1]

                #"Add" last UTR to first exon
                utrs_exons[0][1] = last_five_prime_location[1]
                utrs_exons[0][3] += last_five_prime_location[3]

            #Prepend other UTRs if available
            if (len(five_prime_locations) > 0):
                utrs_exons = five_prime_locations + utrs_exons

        utr_exon_sequence = five_prime_dna_sequence + utr_exon_sequence

        #Handle three_prime UTRs
        if (len(three_prime_locations) > 0):

            #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file
            three_prime_locations = sorted(three_prime_locations,
                                           key=lambda x: x[1])

            #Merge first UTR with CDS if "next" to each other
            if( ( utrs_exons[0][2] == "+" and utrs_exons[-1][1]+utrs_exons[-1][3] == three_prime_locations[0][1] ) or \
                ( utrs_exons[0][2] == "-" and utrs_exons[-1][1]-utrs_exons[-1][3] == three_prime_locations[0][1] ) ):

                #Remove first UTR
                first_three_prime_location = three_prime_locations[0]
                three_prime_locations = three_prime_locations[1:]

                #"Add" first UTR to last exon
                utrs_exons[-1][3] += first_three_prime_location[3]

        #Append other UTRs if available
        if (len(three_prime_locations) > 0):
            utrs_exons = utrs_exons + three_prime_locations

        utr_exon_sequence += three_prime_dna_sequence

        return (utrs_exons, utr_exon_sequence)

    def _cds_aggregation_translation(self, cds_list, feature_list, assembly,
                                     issues):

        dna_sequence = ""
        locations = list()

        # collect phases, and lengths of exons
        # right now, this is only for the purpose of error reporting
        phases = list()
        exons = list()

        #Saving parent mRNA identifier
        Parent_mRNA = cds_list[0]["id"]
        for CDS in (cds_list):
            ftr = feature_list[CDS["index"]]
            phases.append(ftr["phase"])
            Parent_mRNA = ftr["Parent"]

            contig_sequence = assembly["contigs"][ftr["contig"]]["sequence"]
            CDS_ftr = self._convert_ftr_object(
                ftr, contig_sequence
            )  #reverse-complementation for negative strands done here
            exons.append(len(CDS_ftr["dna_sequence"]))

            # Remove base(s) according to phase, but only for first CDS
            if (CDS == cds_list[0] and int(ftr["phase"]) != 0):
                log("Adjusting phase for first CDS: " + CDS["id"])
                CDS_ftr["dna_sequence"] = CDS_ftr["dna_sequence"][
                    int(ftr["phase"]):]

            #aggregate sequences and locations
            dna_sequence += CDS_ftr["dna_sequence"]
            locations.append(CDS_ftr["location"][0])

        # translate sequence
        dna_sequence_obj = Seq(dna_sequence, IUPAC.ambiguous_dna)
        rna_sequence = dna_sequence_obj.transcribe()

        # incomplete gene model with no start codon
        if str(rna_sequence.upper())[:3] not in codon_table.start_codons:
            msg = "Missing start codon for {}. Possibly incomplete gene model.".format(
                Parent_mRNA)
            log(msg)

        # You should never have this problem, needs to be reported rather than "fixed"
        codon_count = len(str(rna_sequence)) % 3
        if codon_count != 0:
            msg = "Number of bases for RNA sequence for {} ".format(
                Parent_mRNA)
            msg += "is not divisible by 3. "
            msg += "The resulting protein may well be mis-translated."
            log(msg)
            issues.append(Parent_mRNA)

        protein_sequence = Seq("")
        try:
            protein_sequence = rna_sequence.translate()
        except CodonTable.TranslationError as te:
            log("TranslationError for: " + feature_object["id"], phases, exons,
                " : " + str(te))

        return (locations, dna_sequence.upper(), str(protein_sequence).upper())
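
For orientation, a minimal driver for the class above might look like the following sketch. It is not part of the original snippet: cfg, the file paths, and the workspace/genome names are placeholders standing in for values the KBase SDK runtime would normally supply.

# Hypothetical usage sketch for FastaGFFToGenome (illustrative only).
# cfg is assumed to expose callbackURL, sharedFolder, shockURL, handleURL,
# workspaceURL and the raw config dict, as the constructor and upload_genome expect.
importer = FastaGFFToGenome(cfg)
result = importer.import_file({
    'fasta_file': {'path': '/kb/module/work/tmp/my_genome.fa'},  # exactly one of 'path' or 'shock_id'
    'gff_file': {'path': '/kb/module/work/tmp/my_genome.gff3'},
    'workspace_name': 'my_workspace',
    'genome_name': 'my_genome'
})
print(result['genome_ref'])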
Exemplo n.º 15
0
class Utils:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.gen_api = GenericsAPI(self.callback_url)
        self.DEFAULT_ONTOLOGY_REF = "KbaseOntologies/Custom"
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def get_conditions(self, params):
        data = self.dfu.get_objects(
            {'object_refs': [params['condition_set_ref']]})['data'][0]['data']
        conditions = {}
        keep_keys = params.get('conditions', data['conditions'].keys())
        for key in keep_keys:
            conditions[key] = defaultdict(list)
            for factor, val in zip(data['factors'], data['conditions'][key]):
                ont_abriv = factor['factor_ont_id'].split(":")[0]
                factor['value'] = val
                conditions[key][ont_abriv].append(copy.copy(factor))
        return {"conditions": conditions}

    def file_to_condition_set(self, params):
        """Convert a user supplied file to a compound set"""
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either a input_shock_id or input_file_path")
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep="\t", dtype='str')
        comp_set = self._df_to_cs_obj(df)
        info = self.dfu.save_objects({
            "id":
            params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.ConditionSet",
                "data": comp_set,
                "name": params['output_obj_name']
            }]
        })[0]
        return {"condition_set_ref": "%s/%s/%s" % (info[6], info[0], info[4])}

    def _conditionset_data_to_df(self, data):
        """
        Converts ConditionSet object data to a dataframe
        """

        factors = pd.DataFrame(data['factors'])
        factors = factors.rename(columns=lambda x: x.replace("ont", "ontology").
                                 capitalize().replace("_", " "))
        conditions = pd.DataFrame(data['conditions'])
        cs_df = factors.join(conditions)

        return cs_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """

        original_matrix_ref = data.get('original_data')
        data_matrix = self.gen_api.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [
            cluster.get('id_to_data_position').keys() for cluster in clusters
        ]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(
                id_names):  # cluster is based on condition
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        cluster_id = 0
        for cluster in clusters:
            item_ids = cluster.get('id_to_data_position').keys()
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]

            for idx in item_idx:
                cluster_names[idx] = cluster_id

            cluster_id += 1

        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a dataframe"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]

        obj_type = res['info'][2]

        if "KBaseExperiments.ConditionSet" in obj_type:
            cs_df = self._conditionset_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.ConditionSet or KBaseExperiments.ClusterSet'
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _df_to_cs_obj(self, cs_df):
        """Converts a dataframe from a user file to a compound set object"""
        condition_set = {'ontology_mapping_method': "User Curation"}
        cs_df.fillna('', inplace=True)
        if not len(cs_df):
            raise ValueError("No factors in supplied files")
        factor_df = cs_df.filter(regex="[Uu]nit|[Ff]actor")
        condition_df = cs_df.drop(factor_df.columns, axis=1)
        if not len(condition_df.columns):
            raise ValueError(
                "Unable to find any condition columns in supplied file")

        factor_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
            inplace=True)
        if "factor" not in factor_df.columns:
            raise ValueError(
                "Unable to find a 'Factor' column in supplied file")
        factor_fields = ('factor', 'unit', 'factor_ont_id', 'unit_ont_id')
        factors = factor_df.filter(items=factor_fields).to_dict('records')

        condition_set['factors'] = [
            self._add_ontology_info(f) for f in factors
        ]
        condition_set['conditions'] = condition_df.to_dict('list')
        return condition_set

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: Test to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id)
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, factor):
        """Searches KBASE ontologies for terms matching the user supplied factors and units.
        Add the references if found"""
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        factor = {
            k: v
            for k, v in factor.items() if k not in optionals or v != ""
        }
        ont_info = self._search_ontologies(
            factor.get('factor_ont_id', "").replace("_", ":"))
        if ont_info:
            factor['factor_ont_ref'] = ont_info['ontology_ref']
            factor['factor_ont_id'] = ont_info['id']
        else:
            factor['factor_ont_ref'] = self.DEFAULT_ONTOLOGY_REF
            factor['factor_ont_id'] = self.DEFAULT_ONTOLOGY_ID

        if factor.get('unit'):
            ont_info = self._search_ontologies(
                factor.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                factor['unit_ont_ref'] = ont_info['ontology_ref']
                factor['unit_ont_id'] = ont_info['id']
            else:
                factor['unit_ont_ref'] = self.DEFAULT_ONTOLOGY_REF
                factor['unit_ont_id'] = self.DEFAULT_UNIT_ID
        return factor

    def to_tsv(self, params):
        """Convert an compound set to TSV file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)

        return _id, files

    def to_excel(self, params):
        """Convert an compound set to Excel file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        writer = pd.ExcelWriter(files['file_path'])

        if "KBaseExperiments.ConditionSet" in obj_type:
            df.to_excel(writer, "Conditions", index=False)
        elif "KBaseExperiments.ClusterSet" in obj_type:
            df.to_excel(writer, "ClusterSet", index=True)
        # else is checked in `_ws_obj_to_df`

        writer.save()

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
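
A hypothetical round trip through the Utils class above is sketched here; the config values, workspace id, and file paths are placeholders rather than values taken from the original snippet.

# Illustrative usage sketch for Utils (not part of the original code).
# The config keys mirror what __init__ reads; SDK_CALLBACK_URL must be set in the environment.
util = Utils({'scratch': '/kb/module/work/tmp',
              'search-url': 'https://<kbase-env>/services/searchapi'})
result = util.file_to_condition_set({
    'input_file_path': '/kb/module/work/tmp/conditions.xlsx',  # or 'input_shock_id'
    'output_ws_id': 12345,
    'output_obj_name': 'my_condition_set'
})
_id, files = util.to_tsv({'input_ref': result['condition_set_ref'],
                          'destination_dir': '/kb/module/work/tmp'})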
Exemplo n.º 16
0
class ReadsAlignmentUtils:
    '''
    Module Name:
    ReadsAlignmentUtils

    Module Description:
    A KBase module: ReadsAlignmentUtils

This module is intended for use by Aligners and Assemblers to upload and download alignment files.
The alignment may be uploaded as a SAM or BAM file. If a SAM file is given, it is converted to
the sorted BAM format and saved. Upon downloading, optional parameters may be provided to get files
in SAM and BAI formats from the downloaded BAM file. This utility also generates stats from the
stored alignment.
    '''
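
    # Illustrative note (not from the original code): the upload path expects a
    # params dict supplying destination_ref (e.g. "my_workspace/my_alignment"),
    # file_path to a SAM or BAM file, condition, read_library_ref and
    # assembly_or_genome_ref; see _proc_upload_alignment_params below.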

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git"
    GIT_COMMIT_HASH = "a807d122b097a4c6713a81d5a82eef335835f77a"

    #BEGIN_CLASS_HEADER

    PARAM_IN_FILE = 'file_path'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_CONDITION = 'condition'
    PARAM_IN_READ_LIB_REF = 'read_library_ref'
    PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref'

    PARAM_IN_ALIGNED_USING = 'aligned_using'
    PARAM_IN_ALIGNER_VER = 'aligner_version'
    PARAM_IN_ALIGNER_OPTS = 'aligner_opts'
    PARAM_IN_REPLICATE_ID = 'replicate_id'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index'
    PARAM_IN_SAMPLESET_REF = 'sampleset_ref'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'

    PARAM_IN_DOWNLOAD_SAM = 'downloadSAM'
    PARAM_IN_DOWNLOAD_BAI = 'downloadBAI'
    PARAM_IN_VALIDATE = 'validate'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    def _get_file_path_info(self, file_path):
        """
        Given a file path, returns the directory, file name, file base and file extension
        """
        dir, file_name = os.path.split(file_path)
        file_base, file_ext = os.path.splitext(file_name)

        return dir, file_name, file_base, file_ext

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _check_required_param(self, in_params, param_list):
        """
        Checks if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Checks the validity of workspace and object params and returns them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)
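        # Illustrative: a destination_ref like "my_workspace/my_alignment" splits
        # into ws_name_id "my_workspace" and obj_name_id "my_alignment"; the
        # workspace name is resolved to a numeric id via dfu.ws_name_to_id below.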

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _proc_upload_alignment_params(self, ctx, params):
        """
        Checks the presence and validity of upload alignment params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION,
            self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        file_path = params.get(self.PARAM_IN_FILE)

        if not (os.path.isfile(file_path)):
            raise ValueError('File does not exist: ' + file_path)

        lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2]
        if not lib_type.startswith(('KBaseFile.SingleEndLibrary',
                                    'KBaseFile.PairedEndLibrary',
                                    'KBaseAssembly.SingleEndLibrary',
                                    'KBaseAssembly.PairedEndLibrary')):
            raise ValueError(self.PARAM_IN_READ_LIB_REF +
                             ' parameter should be of type' +
                             ' KBaseFile.SingleEndLibrary or' +
                             ' KBaseFile.PairedEndLibrary or' +
                             ' KBaseAssembly.SingleEndLibrary or' +
                             ' KBaseAssembly.PairedEndLibrary')

        obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2]
        if not obj_type.startswith(('KBaseGenomes.Genome',
                                    'KBaseGenomeAnnotations.Assembly',
                                    'KBaseGenomes.ContigSet')):
            raise ValueError(self.PARAM_IN_ASM_GEN_REF +
                             ' parameter should be of type' +
                             ' KBaseGenomes.Genome or' +
                             ' KBaseGenomeAnnotations.Assembly or' +
                             ' KBaseGenomes.ContigSet')
        return ws_name_id, obj_name_id, file_path, lib_type

    def _get_aligner_stats(self, bam_file):
        """
        Gets the aligner stats from BAM file
        """
        path, file = os.path.split(bam_file)
        return self.samtools.get_stats(file, path)

    def _validate(self, params):
        samt = SamTools(self.config, self.__LOGGER)
        if 'ignore' in params:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file,
                                 ipath=path,
                                 ignore=params['ignore'])
        else:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file, ipath=path)

        return rval

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.__LOGGER = logging.getLogger('KBaseRNASeq')
        if 'log_level' in config:
            self.__LOGGER.setLevel(config['log_level'])
        else:
            self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s")
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        script_utils.check_sys_stat(self.__LOGGER)

        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.samtools = SamTools(config)
        #END_CONSTRUCTOR
        pass

    def validate_alignment(self, ctx, params):
        """
        :param params: instance of type "ValidateAlignmentParams" (* Input
           parameters for validating a reads alignment. For validation errors
           to ignore, see
           http://broadinstitute.github.io/picard/command-line-overview.html#V
           alidateSamFile) -> structure: parameter "file_path" of String,
           parameter "ignore" of list of String
        :returns: instance of type "ValidateAlignmentOutput" (* Results from
           validate alignment *) -> structure: parameter "validated" of type
           "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN validate_alignment

        rval = self._validate(params)

        if rval == 0:
            returnVal = {'validated': True}
        else:
            returnVal = {'validated': False}

        #END validate_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method validate_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
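
    # Illustrative validate_alignment call (hypothetical client-side usage; the
    # instance name `rau`, the path and the Picard error names below are
    # placeholders, not values defined in this module):
    #
    #   rau.validate_alignment(ctx, {
    #       'file_path': '/kb/module/work/tmp/accepted_hits.sam',
    #       'ignore': ['MATE_NOT_FOUND', 'MISSING_READ_GROUP']
    #   })
    #   # -> [{'validated': True}] on success, [{'validated': False}] otherwise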

    def upload_alignment(self, ctx, params):
        """
        Validates and uploads the reads alignment  *
        :param params: instance of type "UploadAlignmentParams" (* Required
           input parameters for uploading a reads alignment string
           destination_ref -  object reference of alignment destination. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id file_path              -  File with the path of the sam or
           bam file to be uploaded. If a sam file is provided, it will be
           converted to the sorted bam format before being saved
           read_library_ref       -  workspace object ref of the read sample
           used to make the alignment file condition              -
           assembly_or_genome_ref -  workspace object ref of genome assembly
           or genome object that was used to build the alignment *) ->
           structure: parameter "destination_ref" of String, parameter
           "file_path" of String, parameter "read_library_ref" of String,
           parameter "condition" of String, parameter
           "assembly_or_genome_ref" of String, parameter "aligned_using" of
           String, parameter "aligner_version" of String, parameter
           "aligner_opts" of mapping from String to String, parameter
           "replicate_id" of String, parameter "platform" of String,
           parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter
           "sampleset_ref" of type "ws_Sampleset_ref", parameter
           "mapped_sample_id" of mapping from String to mapping from String
           to String, parameter "validate" of type "boolean" (A boolean - 0
           for false, 1 for true. @range (0, 1)), parameter "ignore" of list
           of String
        :returns: instance of type "UploadAlignmentOutput" (*  Output from
           uploading a reads alignment  *) -> structure: parameter "obj_ref"
           of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_alignment

        self.__LOGGER.info(
            'Starting upload Reads Alignment, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params(
            ctx, params)

        dir, file_name, file_base, file_ext = self._get_file_path_info(
            file_path)

        if params.get(self.PARAM_IN_VALIDATE, False):
            if self._validate(params) == 1:
                raise Exception('{0} failed validation'.format(file_path))

        bam_file = file_path
        if file_ext.lower() == '.sam':
            bam_file = os.path.join(dir, file_base + '.bam')
            self.samtools.convert_sam_to_sorted_bam(ifile=file_name,
                                                    ipath=dir,
                                                    ofile=bam_file)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': bam_file,
            'make_handle': 1
        })
        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        # stats should come from the (possibly converted) sorted BAM file
        aligner_stats = self._get_aligner_stats(bam_file)
        aligner_data = {
            'file': file_handle,
            'size': file_size,
            'condition': params.get(self.PARAM_IN_CONDITION),
            'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF),
            'library_type': lib_type,
            'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF),
            'alignment_stats': aligner_stats
        }
        optional_params = [
            self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER,
            self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID,
            self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX,
            self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID
        ]
        for opt_param in optional_params:
            if opt_param in params and params[opt_param] is not None:
                aligner_data[opt_param] = params[opt_param]

        self.__LOGGER.info('=========  Adding extra_provenance_refs')
        self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF))
        self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF))
        self.__LOGGER.info('=======================================')

        res = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type":
                "KBaseRNASeq.RNASeqAlignment",
                "data":
                aligner_data,
                "name":
                obj_name_id,
                "extra_provenance_input_refs": [
                    params.get(self.PARAM_IN_READ_LIB_REF),
                    params.get(self.PARAM_IN_ASM_GEN_REF)
                ]
            }]
        })[0]
        self.__LOGGER.info('save complete')

        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        self.__LOGGER.info(returnVal)

        #END upload_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
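
    # Illustrative upload_alignment call (hypothetical usage; the instance name
    # `rau`, workspace/object refs and the file path are placeholders):
    #
    #   rau.upload_alignment(ctx, {
    #       'destination_ref': 'my_workspace/my_alignment',
    #       'file_path': '/kb/module/work/tmp/accepted_hits.sam',
    #       'read_library_ref': '123/4/1',
    #       'condition': 'wild_type',
    #       'assembly_or_genome_ref': '123/7/1',
    #       'aligned_using': 'hisat2',
    #       'aligner_version': '2.1.0',
    #       'validate': 1
    #   })
    #   # -> [{'obj_ref': '<ws_id>/<obj_id>/<version>'}]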

    def download_alignment(self, ctx, params):
        """
        Downloads alignment files in .bam, .sam and .bai formats. Also downloads alignment stats *
        :param params: instance of type "DownloadAlignmentParams" (* Required
           input parameters for downloading a reads alignment string
           source_ref -  object reference of alignment source. The object ref
           is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "DownloadAlignmentOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String, parameter "stats" of type
           "AlignmentStats" -> structure: parameter "properly_paired" of
           Long, parameter "multiple_alignments" of Long, parameter
           "singletons" of Long, parameter "alignment_rate" of Double,
           parameter "unmapped_reads" of Long, parameter "mapped_reads" of
           Long, parameter "total_reads" of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_alignment

        self.__LOGGER.info('Running download_alignment with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        try:
            alignment = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        uuid_str = str(uuid.uuid4())
        output_dir = os.path.join(self.scratch, 'download_' + uuid_str)
        self._mkdir_p(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id':
            alignment[0]['data']['file']['id'],
            'file_path':
            output_dir
        })
        if zipfile.is_zipfile(file_ret.get('file_path')):
            with zipfile.ZipFile(file_ret.get('file_path')) as z:
                z.extractall(output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        bam_files = glob.glob(output_dir + '/*.bam')

        uuid_prefix = uuid_str[:8]
        if len(bam_files) == 0:
            raise ValueError("Alignment object does not contain a bam file")

        for bam_file_path in bam_files:
            dir, file_name, file_base, file_ext = self._get_file_path_info(
                bam_file_path)
            if params.get(self.PARAM_IN_VALIDATE, False):
                validate_params = {'file_path': bam_file_path}
                if self._validate(validate_params) == 1:
                    raise Exception(
                        '{0} failed validation'.format(bam_file_path))

            if params.get('downloadBAI', False):
                bai_file = uuid_prefix + '_' + file_base + '.bai'
                bai_file_path = os.path.join(output_dir, bai_file)
                self.samtools.create_bai_from_bam(ifile=file_name,
                                                  ipath=output_dir,
                                                  ofile=bai_file)
                if not os.path.isfile(bai_file_path):
                    raise ValueError('Error creating {}'.format(bai_file_path))

            if params.get('downloadSAM', False):
                sam_file = uuid_prefix + '_' + file_base + '.sam'
                sam_file_path = os.path.join(output_dir, sam_file)
                self.samtools.convert_bam_to_sam(ifile=file_name,
                                                 ipath=output_dir,
                                                 ofile=sam_file)
                if not os.path.isfile(sam_file_path):
                    raise ValueError('Error creating {}'.format(sam_file_path))

        returnVal = {
            'destination_dir': output_dir,
            'stats': alignment[0]['data']['alignment_stats']
        }

        #END download_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
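
    # Illustrative download_alignment call (hypothetical usage; the source_ref
    # below is a placeholder):
    #
    #   rau.download_alignment(ctx, {
    #       'source_ref': 'my_workspace/my_alignment',
    #       'downloadSAM': 1,
    #       'downloadBAI': 1
    #   })
    #   # -> [{'destination_dir': '<scratch>/download_<uuid>',
    #   #      'stats': <alignment_stats stored on the object>}]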

    def export_alignment(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download alignments from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting a reads alignment string source_ref - 
           object reference of alignment source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_alignment

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        if params.get(self.PARAM_IN_VALIDATE, False) or \
           params.get('exportBAI', False) or \
           params.get('exportSAM', False):
            """
            Need to validate or convert files. Use download_alignment
            """
            download_params = {}
            for key, val in params.iteritems():
                download_params[key.replace('export', 'download')] = val

            download_retVal = self.download_alignment(ctx, download_params)[0]

            export_dir = download_retVal['destination_dir']

            # package and load to shock
            ret = self.dfu.package_for_download({
                'file_path': export_dir,
                'ws_refs': [inref]
            })
            output = {'shock_id': ret['shock_id']}
        else:
            """
            return shock id from the object
            """
            try:
                alignment = self.dfu.get_objects({'object_refs':
                                                  [inref]})['data']
            except DFUError as e:
                self.__LOGGER.error(
                    'Logging stacktrace from workspace exception:\n' + e.data)
                raise
            output = {'shock_id': alignment[0]['data']['file']['id']}

        #END export_alignment

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_alignment return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
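
    # Illustrative export_alignment call (hypothetical usage; the source_ref is
    # a placeholder). With no export/validate flags set, the shock id stored on
    # the object is returned directly; otherwise the files are repackaged via
    # download_alignment:
    #
    #   rau.export_alignment(ctx, {'source_ref': 'my_workspace/my_alignment'})
    #   # -> [{'shock_id': '<shock node id>'}]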

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Exemplo n.º 17
0
def upload_genome(shock_service_url=None,
                  handle_service_url=None,
                  workspace_service_url=None,
                  callback_url=None,
                  input_gff_file=None,
                  input_fasta_file=None,
                  workspace_name=None,
                  core_genome_name=None,
                  scientific_name="unknown_taxon",
                  taxon_wsname='ReferenceTaxons',
                  taxon_reference=None,
                  source=None,
                  release=None,
                  genome_type=None):

    assembly_ref = None
    gff_handle_ref = None
    time_string = str(
        datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y_%m_%d_%H_%M_%S'))

    dfUtil = DataFileUtil(callback_url)

    ###########################################
    #Retrieve taxon
    #Taxon lookup dependent on full genus
    #Example: Athaliana    Arabidopsis thaliana
    ###########################################
    #default to
    taxon_id = -1
    taxon_object_name = "unknown_taxon"

    #Retrieve lookup object if scientific name provided
    if (taxon_reference is None and scientific_name != "unknown_taxon"):
        #Need to retrieve taxon lookup object then find taxon id
        taxon_lookup = dfUtil.get_objects({
            'object_refs': [taxon_wsname + "/taxon_lookup"],
            'ignore_errors':
            0
        })['data'][0]['data']['taxon_lookup']
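        #The lookup object is keyed on the first three characters of the
        #scientific name; illustrative shape (hypothetical entry):
        #  taxon_lookup["Ara"]["Arabidopsis thaliana"] -> 3702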

        if (scientific_name[0:3] in taxon_lookup
                and scientific_name in taxon_lookup[scientific_name[0:3]]):
            taxon_id = taxon_lookup[scientific_name[0:3]][scientific_name]
            taxon_object_name = "%s_taxon" % (str(taxon_id))

    #Retrieve Taxon object
    taxon_info = {}
    if (taxon_reference is None):
        taxon_info = dfUtil.get_objects({
            'object_refs': [taxon_wsname + "/" + taxon_object_name],
            'ignore_errors':
            0
        })['data'][0]
        taxon_reference = "%s/%s/%s" % (taxon_info['info'][6],
                                        taxon_info['info'][0],
                                        taxon_info['info'][4])
    else:
        taxon_info = dfUtil.get_objects({
            "object_refs": [taxon_reference],
            'ignore_errors': 0
        })['data'][0]

    taxonomy = taxon_info['data']['scientific_lineage']
    ###########################################
    #End taxonomy retrieval
    ###########################################

    ###########################################
    #Create logger
    ###########################################
    logger = logging.getLogger(__file__)
    logger.setLevel(logging.INFO)

    # send messages to sys.stderr
    streamHandler = logging.StreamHandler(sys.stderr)

    formatter = logging.Formatter(
        "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s"
    )
    formatter.converter = time.gmtime
    streamHandler.setFormatter(formatter)

    logger.addHandler(streamHandler)
    ###########################################
    #End logger creation
    ###########################################

    ##########################################
    #Reading in Fasta file, Code taken from https://www.biostars.org/p/710/
    ##########################################
    logger.info("Reading FASTA file.")

    assembly = {
        "contigs": {},
        "dna_size": 0,
        "gc_content": 0,
        "md5": [],
        "base_counts": {}
    }
    contig_seq_start = 0

    input_file_handle = open(input_fasta_file, 'rb')

    # alternate header and sequence
    faiter = (x[1] for x in itertools.groupby(input_file_handle,
                                              lambda line: line[0] == ">"))
    for header in faiter:
        # drop the ">"
        header = header.next()[1:].strip()
        # join all sequence lines to one.
        seq = "".join(s.strip() for s in faiter.next())

        try:
            fasta_header, fasta_description = header.split(' ', 1)
        except ValueError:
            fasta_header = header
            fasta_description = None

        #Handle record
        seq = seq.upper()

        #Build contig objects for Assembly
        seq_count = dict(collections.Counter(seq))

        #to delete at end, but required for now
        contig_dict = {"sequence": seq}

        Ncount = 0
        if "N" in seq_count:
            Ncount = seq_count["N"]
        contig_dict["Ncount"] = Ncount

        for character in seq_count:
            if character in assembly["base_counts"]:
                assembly["base_counts"][character] += seq_count[character]
            else:
                assembly["base_counts"][character] = seq_count[character]

        contig_seq_length = len(seq)
        assembly["dna_size"] += contig_seq_length

        contig_gc_length = seq.count("G")
        contig_gc_length += seq.count("C")
        contig_dict["gc_content"] = float("{0:.2f}".format(
            float(contig_gc_length) / float(contig_seq_length)))
        assembly["gc_content"] += contig_gc_length

        contig_dict["contig_id"] = fasta_header
        contig_dict["name"] = fasta_header
        contig_dict["length"] = contig_seq_length
        contig_dict["md5"] = hashlib.md5(seq).hexdigest()
        assembly["md5"].append(contig_dict["md5"])

        if fasta_description is not None:
            contig_dict["description"] = fasta_description

        contig_dict["is_circular"] = "Unknown"
        contig_dict["start_position"] = contig_seq_start
        contig_dict["num_bytes"] = sys.getsizeof(contig_dict["sequence"])

        assembly["contigs"][fasta_header] = contig_dict

        #used for start of next sequence and total gc_content
        contig_seq_start += contig_seq_length

    assembly["gc_content"] = float("{0:.2f}".format(
        float(assembly["gc_content"]) / float(contig_seq_start)))
    assembly["md5"] = hashlib.md5(",".join(assembly["md5"])).hexdigest()
    assembly["assembly_id"] = core_genome_name + "_assembly"
    assembly["name"] = scientific_name
    assembly["external_source"] = source
    assembly["external_source_id"] = os.path.basename(input_fasta_file)
    assembly["external_source_origination_date"] = str(
        os.stat(input_fasta_file).st_ctime)
    assembly["num_contigs"] = len(assembly["contigs"].keys())
    assembly["type"] = "Unknown"
    assembly[
        "notes"] = "Note MD5s are generated from uppercasing the sequences"

    if taxon_reference is not None:
        assembly["taxon_ref"] = taxon_reference

    logger.info("Reading GFF file.")

    header = list()
    feature_list = dict()
    original_CDS_count = dict()
    original_feature_ids = dict()

    #    gff_file_handle = gzip.open(input_gff_file, 'rb')
    gff_file_handle = open(input_gff_file, 'rb')
    current_line = gff_file_handle.readline()
    gff_object = dict()
    while (current_line != ''):
        current_line = current_line.strip()

        if (current_line.startswith("##") or current_line.startswith("#!")):
            header.append(current_line)
            if ('headers' not in gff_object):
                gff_object['headers'] = list()
            gff_object['headers'].append(current_line)
        else:
            if ('features' not in gff_object):
                gff_object['features'] = list()

            contig_id, source_id, feature_type, start, end, score, strand, phase, attributes = current_line.split(
                '\t')
            attributes_dict = dict()
            for attribute in attributes.split(";"):
                if (attribute == "" or "=" not in attribute):
                    continue
                key, value = attribute.split("=", 1)
                attributes_dict[key] = value

            #ID should be transferred from Name or Parent
            old_id = None
            for key in ("ID", "PACid", "pacid"):
                if (key in attributes_dict):
                    old_id = attributes_dict[key]
                    break
            if (old_id is None):
                eprint(
                    "Cannot find unique ID, PACid, or pacid in GFF attributes: "
                    + attributes)
                continue

            if ("Name" in attributes_dict):
                attributes_dict["ID"] = attributes_dict["Name"]
            else:
                attributes_dict["ID"] = original_feature_ids[
                    attributes_dict["Parent"]] + "." + feature_type

                #if CDS have to increment
                if (feature_type == "CDS"):
                    if (attributes_dict["ID"] not in original_CDS_count):
                        original_CDS_count[attributes_dict["ID"]] = 1
                    else:
                        original_CDS_count[attributes_dict["ID"]] += 1

                    attributes_dict["ID"] += "." + str(
                        original_CDS_count[attributes_dict["ID"]])

            #Update parent
            if ("Parent" in attributes_dict):
                attributes_dict["Parent"] = original_feature_ids[
                    attributes_dict["Parent"]]

            original_feature_ids[old_id] = attributes_dict["ID"]

            #recreate line for GFF
            partial_line, attributes = current_line.rsplit('\t', 1)
            new_line = partial_line + "\t" + ";".join(
                key + "=" + attributes_dict[key]
                for key in attributes_dict.keys())
            gff_object['features'].append(new_line)

            if (contig_id not in assembly["contigs"]):
                logger.warn("Missing contig: " + contig_id)

            if (contig_id not in feature_list):
                feature_list[contig_id] = list()

            feature = {
                'type': feature_type,
                'start': int(start),
                'end': int(end),
                'score': score,
                'strand': strand,
                'phase': phase
            }
            for attribute in attributes.split(";"):
                if (attribute == "" or "=" not in attribute):
                    continue
                key, value = attribute.split("=", 1)
                feature[key] = value

            #Append contig identifier
            feature["contig"] = contig_id
            feature_list[contig_id].append(feature)

        current_line = gff_file_handle.readline()
    gff_file_handle.close()

    #Writing updated lines to gff_file_handle
    input_gff_file = input_gff_file.replace("gene", "edited_gene")
    gff_file_handle = gzip.open(input_gff_file, 'wb')
    if ('headers' in gff_object):
        gff_file_handle.write("\n".join(gff_object["headers"]) + "\n")
    gff_file_handle.write("\n".join(gff_object["features"]))
    gff_file_handle.close()

    #New code inserted to better handle feature identifiers
    #Start by extracting and group them first
    features_identifiers_dict = dict()
    features_identifiers_list = list()
    features_identifiers_count = dict()
    features_parents_dict = dict()
    features_name_id_dict = dict()
    CDS_count = dict()
    for contig in sorted(feature_list):
        for feature in feature_list[contig]:
            #We're only considering gene, mRNA, and CDS for brevity's sake
            if (feature["type"] not in ("gene", "mRNA", "CDS")):
                continue

            #gene and mRNA always have name, CDS do not
            if ("Name" not in feature):
                feature["Name"] = None

            #Update parent following name/id switch
            if ("Parent" in feature
                    and feature["Parent"] in features_name_id_dict):
                feature["Parent"] = features_name_id_dict[feature["Parent"]]

            #ID should be transferred to Name, but need to maintain parent
            if (feature["Name"] is not None):
                features_name_id_dict[feature["ID"]] = feature["Name"]
                feature["ID"] = feature["Name"]
            else:
                feature["ID"] = feature["Parent"] + "." + feature["type"]
                #if CDS have to increment
                if (feature["type"] == "CDS"):
                    if (feature["ID"] not in CDS_count):
                        CDS_count[feature["ID"]] = 1
                    else:
                        CDS_count[feature["ID"]] += 1

                    feature["ID"] += "." + str(CDS_count[feature["ID"]])

            #Collect
            if (feature["type"] == "gene"):
                features_identifiers_dict[feature["ID"]] = dict()
            if (feature["type"] == "mRNA"):
                features_identifiers_dict[feature["Parent"]][
                    feature["ID"]] = dict()
                features_parents_dict[feature["ID"]] = feature["Parent"]
            if (feature["type"] == "CDS"):
                features_identifiers_dict[features_parents_dict[
                    feature["Parent"]]][feature["Parent"]][feature["ID"]] = 1

            features_identifiers_list.append(feature)
            features_identifiers_count[
                feature["ID"]] = len(features_identifiers_list) - 1

    updated_features_identifiers_dict = dict()
    updated_features_list = list()
    updated_features_identifiers_count = dict()
    updated_features_parents_dict = dict()
    updated_CDS_count = dict()
    for gene in sorted(features_identifiers_dict):

        #retrieve original object
        gene_ftr = features_identifiers_list[features_identifiers_count[gene]]

        #store gene
        updated_features_identifiers_dict[gene_ftr["ID"]] = dict()
        updated_features_list.append(gene_ftr)
        updated_features_identifiers_count[
            gene_ftr["ID"]] = len(updated_features_list) - 1

        for mRNA in sorted(features_identifiers_dict[gene],
                           key=lambda x: features_identifiers_count[x]):
            #retrieve feature
            mRNA_ftr = features_identifiers_list[
                features_identifiers_count[mRNA]]

            if ("PAC" in mRNA[0:3]):
                if ("Name" in mRNA_ftr):
                    mRNA_ftr["ID"] = mRNA_ftr["Name"]

            updated_features_identifiers_dict[gene_ftr["ID"]][
                mRNA_ftr["ID"]] = dict()
            updated_features_parents_dict[mRNA_ftr["ID"]] = mRNA_ftr["Parent"]

            updated_features_list.append(mRNA_ftr)
            updated_features_identifiers_count[
                mRNA_ftr["ID"]] = len(updated_features_list) - 1

            for CDS in sorted(features_identifiers_dict[gene][mRNA],
                              key=lambda x: features_identifiers_count[x]):
                #retrieve feature
                CDS_ftr = features_identifiers_list[
                    features_identifiers_count[CDS]]

                if ("PAC" in CDS[0:3]):
                    CDS_ftr["ID"] = mRNA_ftr["ID"] + ".CDS"

                    if (CDS_ftr["ID"] not in updated_CDS_count):
                        updated_CDS_count[CDS_ftr["ID"]] = 1
                    else:
                        updated_CDS_count[CDS_ftr["ID"]] += 1

                    CDS_ftr["ID"] += "." + str(
                        updated_CDS_count[CDS_ftr["ID"]])
                    CDS_ftr["Parent"] = mRNA_ftr["ID"]

                updated_features_identifiers_dict[gene_ftr["ID"]][
                    mRNA_ftr["ID"]][CDS_ftr["ID"]] = 1
                updated_features_parents_dict[
                    CDS_ftr["ID"]] = CDS_ftr["Parent"]

                updated_features_list.append(CDS_ftr)
                updated_features_identifiers_count[
                    CDS_ftr["ID"]] = len(updated_features_list) - 1

    genome_features_list = list()
    genome_mrnas_list = list()
    genome_cdss_list = list()
    for gene in sorted(updated_features_identifiers_dict):
        #retrieve updated object
        gene_ftr = updated_features_list[
            updated_features_identifiers_count[gene]]

        gene_object = convert_ftr_object(
            gene_ftr, assembly["contigs"][gene_ftr["contig"]]["sequence"])
        gene_object["type"] = "gene"

        #New terms, TODO, move to end of gene loop
        gene_object["cdss"] = list()
        gene_object["mrnas"] = list()

        #use function of longest CDS for gene
        longest_protein_length = 0
        longest_protein_sequence = ""
        for mRNA in sorted(
                updated_features_identifiers_dict[gene],
                key=lambda x: updated_features_identifiers_count[x]):
            #retrieve updated object
            mRNA_ftr = updated_features_list[
                updated_features_identifiers_count[mRNA]]

            feature_object = convert_ftr_object(
                mRNA_ftr, assembly["contigs"][mRNA_ftr["contig"]]["sequence"])
            feature_object['parent_gene'] = gene_object['id']

            mrna_object = copy.deepcopy(feature_object)
            cds_object = copy.deepcopy(feature_object)

            cds_object['id'] = mrna_object['id'] + ".CDS"
            mrna_object['cds'] = cds_object['id']

            cds_object['parent_mrna'] = mrna_object['id']

            del mrna_object["dna_sequence"]
            del mrna_object["dna_sequence_length"]

            cds_object["ontology_terms"] = dict()

            gene_object["mrnas"].append(mrna_object["id"])
            gene_object["cdss"].append(cds_object["id"])

            #CDS aggregation needs to be done in order to build protein sequence and list of locations
            CDS_list = sorted(
                updated_features_identifiers_dict[gene][mRNA],
                key=lambda x: updated_features_identifiers_count[x])

            dna_sequence = ""
            locations = list()

            #collect phases, and lengths of exons
            #right now, this is only for the purpose of error reporting
            phases = list()
            exons = list()

            for CDS in (CDS_list):
                #retrieve updated partial CDS
                add_ftr = updated_features_list[
                    updated_features_identifiers_count[CDS]]
                phases.append(add_ftr["phase"])

                add_ftr_obj = convert_ftr_object(
                    add_ftr,
                    assembly["contigs"][add_ftr["contig"]]["sequence"])
                exons.append(len(add_ftr_obj["dna_sequence"]))

                #Remove base(s) according to phase, but only for first CDS
                if (CDS == CDS_list[0] and int(add_ftr["phase"]) != 0):
                    logger.info("Adjusting phase for first CDS: " + CDS)
                    add_ftr_obj["dna_sequence"] = add_ftr_obj["dna_sequence"][
                        int(add_ftr["phase"]):]

                dna_sequence += add_ftr_obj["dna_sequence"]
                locations.append(add_ftr_obj["location"][0])

            #translate sequence
            dna_sequence_obj = Seq(dna_sequence, IUPAC.ambiguous_dna)
            rna_sequence = dna_sequence_obj.transcribe()

            #Incomplete gene model with no start codon
            #Translate as is
            if str(rna_sequence.upper())[:3] not in codon_table.start_codons:
                logger.info("Missing start codon for " + feature_object["id"] +
                            " Assuming incomplete gene model.")
                #temp_seq = 'AUG'+str(rna_sequence.upper())[3:]
                #rna_sequence = Seq(temp_seq, IUPAC.ambiguous_dna)

            #You should never have this problem, needs to be reported rather than "fixed"
            codon_count = len(str(rna_sequence)) % 3
            if codon_count != 0:
                logger.info(
                    "Number of bases for RNA sequence for " +
                    feature_object["id"] +
                    " is not divisible by 3. The resulting protein may well be mis-translated."
                )
                #temp_seq = str(rna_sequence.upper())+"N"
                #if codon_count == 1:
                #    temp_seq+="N"
                #new_codon_count=len(temp_seq) % 3
                #rna_sequence = Seq(temp_seq, IUPAC.ambiguous_dna)

            protein_sequence = Seq("")
            try:
                protein_sequence = rna_sequence.translate()  #cds=True)
            except CodonTable.TranslationError as te:
                logger.info("TranslationError for: " + feature_object["id"],
                            phases, exons, " : " + str(te))

            cds_object["protein_translation"] = str(protein_sequence).upper()
            cds_object["protein_translation_length"] = len(
                cds_object["protein_translation"])
            cds_object["md5"] = hashlib.md5(
                cds_object["protein_translation"]).hexdigest()

            if (cds_object["protein_translation_length"] >
                    longest_protein_length):
                longest_protein_length = cds_object[
                    "protein_translation_length"]
                longest_protein_sequence = cds_object["protein_translation"]

            del cds_object["dna_sequence"]
            del cds_object["dna_sequence_length"]
            if ("aliases" not in cds_object):
                cds_object["aliases"] = list()
            if ("function" not in cds_object):
                cds_object["function"] = ""

            #End of mRNA loop
            genome_mrnas_list.append(mrna_object)
            genome_cdss_list.append(cds_object)

        #End of gene loop
        gene_object["ontology_terms"] = dict()
        gene_object["protein_translation"] = longest_protein_sequence
        gene_object["protein_translation_length"] = longest_protein_length
        genome_features_list.append(gene_object)

    #remove sequences before loading
    for contig in assembly["contigs"]:
        del assembly["contigs"][contig]["sequence"]


#    assembly_string = simplejson.dumps(assembly, sort_keys=True, indent=4, ensure_ascii=False)
#    assembly_file = open("Bulk_Phytozome_Upload/"+assembly["name"]+'.json', 'w+')
#    assembly_file.write(assembly_string)
#    assembly_file.close()

    if (assembly_ref is None):
        #Upload FASTA to shock
        #Need to gunzip file first
        gunzipped_fasta_file = input_fasta_file
        #        gunzipped_fasta_file=input_fasta_file[0:-3]
        #        with gzip.open(input_fasta_file, 'rb') as f_in:
        #            with open(gunzipped_fasta_file, 'wb') as f_out:
        #                shutil.copyfileobj(f_in, f_out)

        token = os.environ.get('KB_AUTH_TOKEN')

        logger.info("Attempting Assembly save for %s" %
                    (assembly["assembly_id"]))
        aUtil = AssemblyUtil(callback_url)
        assembly_ref = aUtil.save_assembly_from_fasta({
            'file': {
                'path': gunzipped_fasta_file,
                'assembly_name': assembly['assembly_id']
            },
            'workspace_name':
            workspace_name,
            'assembly_name':
            assembly['assembly_id']
        })
        logger.info("Assembly saved for %s" % (assembly["name"]))

        #Remove gunzipped file
        #os.remove(input_fasta_file[0:-3])

    genome = dict()
    genome["id"] = core_genome_name
    genome["scientific_name"] = scientific_name
    genome["assembly_ref"] = assembly_ref
    genome["features"] = genome_features_list
    genome["cdss"] = genome_cdss_list
    genome["mrnas"] = genome_mrnas_list
    genome["source"] = source
    genome["domain"] = "Eukaryota"
    genome["genetic_code"] = 1
    genome["gc_content"] = assembly["gc_content"]
    genome["dna_size"] = assembly["dna_size"]

    if taxon_reference is not None:
        genome["taxon_ref"] = taxon_reference
        genome["taxonomy"] = taxonomy

    UserMeta = dict()
    UserMeta['Taxonomy'] = taxonomy
    UserMeta['Source'] = source
    UserMeta['Domain'] = "Eukaryota"
    UserMeta['Source ID'] = core_genome_name
    UserMeta['Name'] = scientific_name
    UserMeta['Genetic code'] = 1

    UserMeta['GC content'] = assembly["gc_content"]
    UserMeta['Size'] = assembly["dna_size"]
    UserMeta['Number contigs'] = assembly['num_contigs']

    #id_source_version_array = core_genome_name.split("_")
    #version = "_".join(id_source_version_array[2:])
    #UserMeta['Version']=version
    #UserMeta['url']='';

    if (gff_handle_ref is None):
        token = os.environ.get('KB_AUTH_TOKEN')
        file_upload = dfUtil.file_to_shock({
            'file_path': input_gff_file,
            'make_handle': 1,
            'pack': "gzip"
        })
        gff_handle_ref = file_upload['handle']['hid']

    genome['gff_handle_ref'] = gff_handle_ref

    #    genome_string = simplejson.dumps(genome, sort_keys=True, indent=4, ensure_ascii=False)
    #    genome_file = open("Bulk_Phytozome_Upload/"+core_genome_name+'.json', 'w+')
    #    genome_file.write(genome_string)
    #    genome_file.close()

    logger.info("Attempting Genome save for %s" % (core_genome_name))
    workspace_id = dfUtil.ws_name_to_id(workspace_name)
    genome_info = dfUtil.save_objects({
        "id": workspace_id,
        "objects": [{
            "name": core_genome_name,
            "type": "KBaseGenomes.Genome",
            "data": genome,
            #attach the user metadata built above (workspace metadata values
            #must be strings)
            "meta": dict((k, str(v)) for k, v in UserMeta.items())
        }]
    })[0]
    logger.info("Genome saved for %s" % (core_genome_name))

    return {'genome_info': genome_info, 'report_string': ""}