def __init__(self, config):
        """

        :param config:
        :param logger:
        :param directory: Working directory
        :param urls: Service urls
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR
        pass
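
This constructor appears to belong to the CufflinksUtils class (Example #9 below instantiates CufflinksUtils(cls.cfg)). A minimal usage sketch, assuming hypothetical URLs, token, and scratch path; note the constructor also reads os.environ['VERSION']:

import os

# Hypothetical deployment config; keys mirror those read in the constructor above.
config = {
    "workspace-url": "https://<host>/services/ws",
    "SDK_CALLBACK_URL": "http://localhost:5000",
    "srv-wiz-url": "https://<host>/services/service_wizard",
    "KB_AUTH_TOKEN": "<auth token>",
    "shock-url": "https://<host>/services/shock-api",
    "scratch": "/kb/module/work/tmp",
}
os.environ.setdefault("VERSION", "0.0.0")  # placeholder; the constructor reads os.environ['VERSION']
cufflinks_util = CufflinksUtils(config)    # class name assumed from Example #9
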
Example #2
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.shock_url = config['shock-url']
     self.dfu = DataFileUtil(self.callback_url)
     self.au = AssemblyUtil(self.callback_url)
     self.setapi = SetAPI(self.callback_url)
     self.wss = workspaceService(config['workspace-url'])
Example #3
 def __init__(self, config, logger=None):
     self.config = config
     self.logger = logger
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.scratch = os.path.join(config['scratch'],
                                 'EAS_' + str(uuid.uuid4()))
     self.ws_url = config['workspace-url']
     self.ws_client = Workspace(self.ws_url)
     self.dfu = DataFileUtil(self.callback_url)
     self.setAPI = SetAPI(self.callback_url)
     pass
Example #4
 def __init__(self, config, logger=None):
     self.config = config
     self.logger = logger
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.scratch = os.path.join(config['scratch'],
                                 'DEM_' + str(uuid.uuid4()))
     self.ws_url = config['workspace-url']
     self.ws_client = Workspace(self.ws_url)
     self.fv = KBaseFeatureValues(self.callback_url)
     self.dfu = DataFileUtil(self.callback_url)
     self.setAPI = SetAPI(self.callback_url)
     self.gsu = GenomeSearchUtil(self.callback_url)
     self._mkdir_p(self.scratch)
Example #5
 def test_list_object_with_sets(self):
     ws_name1 = self.createWs()
     reads_obj_ref = self.__class__.example_reads_ref
     set_obj_name = "MyReadsSet.1"
     sapi = SetAPI(self.__class__.serviceWizardURL,
                   token=self.getContext()['token'],
                   service_ver=self.__class__.SetAPI_version)
     sapi.save_reads_set_v1({
         'workspace': ws_name1,
         'output_object_name': set_obj_name,
         'data': {
             'description': '',
             'items': [{
                 'ref': reads_obj_ref
             }]
         }
     })
     list_ret = self.getImpl().list_objects_with_sets(
         self.getContext(), {"ws_name": ws_name1})[0]
     ret = list_ret['data']
     self.assertTrue(len(ret) > 0)
     set_count = 0
     for item in ret:
         self.assertTrue("object_info" in item)
         if "set_items" in item:
             set_count += 1
             set_items = item["set_items"]["set_items_info"]
             self.assertEqual(1, len(set_items))
     self.assertEqual(1, set_count)
     self.assertIn('data_palette_refs', list_ret)
     ws_id = self.getWsClient().get_workspace_info({"workspace":
                                                    ws_name1})[0]
     ret2 = self.getImpl().list_objects_with_sets(
         self.getContext(), {"ws_id": ws_id})[0]["data"]
     self.assertEqual(len(ret), len(ret2))
     type_filter = "KBaseSets.ReadsSet"
     ret3 = self.getImpl().list_objects_with_sets(self.getContext(), {
         "types": [type_filter],
         "workspaces": [str(ws_id)]
     })[0]["data"]
     self.assertTrue(len(ret3) > 0)
     for item in ret3:
         info = item['object_info']
         obj_type = info[2].split('-')[0]
         self.assertEqual(type_filter, obj_type)
     type_filter = "KBaseGenomes.Genome"
     ret4 = self.getImpl().list_objects_with_sets(self.getContext(), {
         "types": [type_filter],
         "workspaces": [str(ws_id)]
     })[0]["data"]
     self.assertTrue(len(ret4) == 0)
Example #6
def fetch_reads_refs_from_sampleset(ref, ws_url, srv_wiz_url):
    """
    From the given object ref, return a list of all reads objects that are a part of that
    object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary
    refs that are a member of that ReadsSet. This is returned as a list of dictionaries as follows:
    {
        "ref": reads object reference,
        "condition": condition string associated with that reads object,
        "name": reads object name (needed for saving an AlignmentSet)
    }
    The only one required is "ref", all other keys may or may not be present, based on the reads
    object or object type in initial ref variable. E.g. a RNASeqSampleSet might have condition info
    for each reads object, but a single PairedEndLibrary may not have that info.

    If ref is already a Reads library, just returns a list with ref as a single element.
    """
    obj_type = get_object_type(ref, ws_url)
    refs = list()
    if "KBaseSets.ReadsSet" in obj_type or "KBaseRNASeq.RNASeqSampleSet" in obj_type:
        print("Looking up reads references in ReadsSet object")
        set_client = SetAPI(srv_wiz_url)
        reads_set = set_client.get_reads_set_v1({
                                            "ref": ref,
                                            "include_item_info": 0,
                                            "include_set_item_ref_paths": 1
        })
        print("Got results from ReadsSet object")
        pprint(reads_set)
        ref_list = [r["ref_path"] for r in reads_set["data"]["items"]]
        reads_names = get_object_names(ref_list, ws_url)
        for reads in reads_set["data"]["items"]:
            ref = reads["ref_path"]
            refs.append({
                "ref": ref,
                "condition": reads["label"],
                "name": reads_names[ref]
            })
    elif ("KBaseAssembly.SingleEndLibrary" in obj_type or
          "KBaseFile.SingleEndLibrary" in obj_type or
          "KBaseAssembly.PairedEndLibrary" in obj_type or
          "KBaseFile.PairedEndLibrary" in obj_type):
        refs.append({
            "ref": ref,
            "name": get_object_names([ref], ws_url)[ref]
        })
    else:
        raise ValueError("Unable to fetch reads reference from object {} "
                         "which is a {}".format(ref, obj_type))

    return refs
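
A hedged usage sketch for the helper above; the object reference and service URLs are hypothetical:

sample_refs = fetch_reads_refs_from_sampleset(
    ref="12345/6/7",                                      # e.g. a KBaseSets.ReadsSet reference
    ws_url="https://<host>/services/ws",
    srv_wiz_url="https://<host>/services/service_wizard",
)
for entry in sample_refs:
    # "ref" and "name" are always present here; "condition" is set only for set members.
    print(entry.get("ref"), entry.get("condition"), entry.get("name"))
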
Example #7
def load_reads_set(callback_url, ws_name, reads_set, target_name):
    """
    Combine a list of reads references into a ReadsSet and save it to the given workspace.
    """
    set_client = SetAPI(callback_url)
    set_output = set_client.save_reads_set_v1({
        "workspace": ws_name,
        "output_object_name": target_name,
        "data": {
            "description": "reads set for testing",
            "items": reads_set
        }
    })
    return set_output["set_ref"]
Example #8
 def setUpClass(cls):
     token = environ.get('KB_AUTH_TOKEN', None)
     config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('AlignmentSetEditor'):
         cls.cfg[nameval[0]] = nameval[1]
     # Getting username from Auth profile for token
     authServiceUrl = cls.cfg['auth-service-url']
     auth_client = _KBaseAuth(authServiceUrl)
     user_id = auth_client.get_user(token)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({'token': token,
                     'user_id': user_id,
                     'provenance': [
                         {'service': 'AlignmentSetEditor',
                          'method': 'please_never_use_it_in_production',
                          'method_params': []
                          }],
                     'authenticated': 1})
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL)
     cls.serviceImpl = AlignmentSetEditor(cls.cfg)
     cls.scratch = cls.cfg['scratch']
     cls.callback_url = os.environ['SDK_CALLBACK_URL']
     cls.setAPI = SetAPI(cls.callback_url)
     cls.gfu = GenomeFileUtil(cls.callback_url)
     cls.ru = ReadsUtils(cls.callback_url)
     cls.rau = ReadsAlignmentUtils(cls.callback_url)
     suffix = int(time.time() * 1000)
     cls.wsName = "test_AlignmentSetEditor_" + str(suffix)
     cls.wsClient.create_workspace({'workspace': cls.wsName})
Example #9
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)

        for nameval in config.items('kb_cufflinks'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'kb_cufflinks',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = Workspace(url=cls.wsURL, token=token)
        cls.serviceImpl = kb_cufflinks(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = environ.get('SDK_CALLBACK_URL')
        cls.srv_wiz_url = cls.cfg['srv-wiz-url']

        # cls.wsName = 'cufflinks_test_' + user_id  # reuse existing workspace
        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_cufflinks_" + str(suffix)
        print('workspace_name: ' + cls.wsName)

        try:
            # reuse existing (previously torn down) workspace
            cls.wsClient.undelete_workspace({'workspace': cls.wsName})
            print('reusing old workspace...')
        except BaseException:
            try:
                # create if workspace does not exist
                cls.wsClient.create_workspace({'workspace': cls.wsName})
            except BaseException:
                # get workspace if it exists and was not previously deleted (previously
                # not torn down)
                ws_info = cls.wsClient.get_workspace_info({'workspace': cls.wsName})
                print("creating new workspace: " + str(ws_info))

        cls.dfu = DataFileUtil(cls.callback_url)

        cls.gfu = GenomeFileUtil(cls.callback_url)
        cls.ru = ReadsUtils(cls.callback_url)
        cls.rau = ReadsAlignmentUtils(cls.callback_url)
        cls.set_api = SetAPI(cls.srv_wiz_url, service_ver='dev')

        cls.cufflinks_runner = CufflinksUtils(cls.cfg)

        cls.prepare_data()
Example #10
 def test_two_users_set_inside_dp(self):
     ws_name1_1 = self.createWs()
     # Injecting reads object (real copy) into workspace1
     orig_reads_obj_ref = self.__class__.example_reads_ref
     reads_obj_name = "TestReads"
     self.getWsClient().copy_object({'from': {'ref': orig_reads_obj_ref},
                                     'to': {'workspace': ws_name1_1,
                                            'name': reads_obj_name}})
     copy_reads_obj_ref = ws_name1_1 + '/' + reads_obj_name
     ws_name1_2 = self.createWs()
     set_obj_name = "MyReadsSet.1"
     sapi = SetAPI(self.__class__.serviceWizardURL, token=self.getContext()['token'],
                   service_ver=self.__class__.SetAPI_version)
     sapi.save_reads_set_v1({'workspace': ws_name1_2, 'output_object_name': set_obj_name,
                             'data': {'description': '', 'items': [{'ref': copy_reads_obj_ref}]}})
     orig_set_ref = ws_name1_2 + '/' + set_obj_name
     # Making DP-copy of reads set object by user2
     ws_name2 = self.createWs2()
     # Let's share workspace containing set with user2
     self.getWsClient().set_permissions({'workspace': ws_name1_2, 'new_permission': 'r',
                                         'users': [self.getContext2()['user_id']]})
     # Import reads set ref into DataPalette of third workspace
     dps = DataPaletteService(self.__class__.serviceWizardURL, 
                               token=self.getContext2()['token'],
                               service_ver=self.__class__.DataPalette_version)
     dps.add_to_palette({'workspace': ws_name2, 'new_refs': [{'ref': orig_set_ref}]})
     dp_ref_map = dps.list_data({'workspaces': [ws_name2]})['data_palette_refs']
     set_ref_path = dp_ref_map.itervalues().next() + ';' + orig_set_ref
     reads_ref_path = set_ref_path + ';' + copy_reads_obj_ref
     # Un-share original workspace 
     self.getWsClient().set_permissions({'workspace': ws_name1_2, 'new_permission': 'n',
                                         'users': [self.getContext2()['user_id']]})
     # Let's check that we can list set and see reads object as set item
     ret = self.getImpl().list_objects_with_sets(self.getContext2(),
                                                 {"ws_name": ws_name2})[0]["data"]
     self.assertEqual(1, len(ret))
     item = ret[0]
     self.assertTrue('set_items' in item)
     self.assertTrue('set_items_info' in item['set_items'])
     self.assertEqual(1, len(item['set_items']['set_items_info']))
     # Check access to reads and to set objects
     info = self.getWsClient2().get_object_info_new({'objects': [{'ref': set_ref_path}]})[0]
     self.assertEqual(set_obj_name, info[1])
     info = self.getWsClient2().get_object_info_new({'objects': [{'ref': reads_ref_path}]})[0]
     self.assertEqual(reads_obj_name, info[1])
Example #11
 def test_unique_items(self):
     # Create original workspace with reads object + ReadsSet object
     ws_name1 = self.createWs()
     foft = FakeObjectsForTests(os.environ['SDK_CALLBACK_URL'])
     reads_obj_name = "test.reads.1"
     foft.create_fake_reads({
         'ws_name': ws_name1,
         'obj_names': [reads_obj_name]
     })
     reads_obj_ref = ws_name1 + '/' + reads_obj_name
     set_obj_name = "test.reads_set.1"
     sapi = SetAPI(self.__class__.serviceWizardURL,
                   token=self.getContext()['token'],
                   service_ver=self.__class__.SetAPI_version)
     sapi.save_reads_set_v1({
         'workspace': ws_name1,
         'output_object_name': set_obj_name,
         'data': {
             'description': '',
             'items': [{
                 'ref': reads_obj_ref
             }]
         }
     })
     set_obj_ref = ws_name1 + '/' + set_obj_name
     # Create workspace with DataPalette copy of Reads object and copy of ReadsSet
     ws_name2 = self.createWs()
     dps = DataPaletteService(
         self.__class__.serviceWizardURL,
         token=self.getContext()['token'],
         service_ver=self.__class__.DataPalette_version)
     dps.add_to_palette({
         'workspace':
         ws_name2,
         'new_refs': [{
             'ref': reads_obj_ref
         }, {
             'ref': set_obj_ref
         }]
     })
     # Check if listing in both these workspaces at the same time gives unique items
     ret = self.getImpl().list_objects_with_sets(
         self.getContext(), {"workspaces": [ws_name1, ws_name2]})[0]["data"]
     self.assertEqual(2, len(ret))
Example #12
    def setUpClass(cls):
        cls.token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_ballgown'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(cls.token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': cls.token,
                        'user_id': user_id,
                        'provenance': [{'service': 'kb_ballgown',
                                        'method': 'please_never_use_it_in_production',
                                        'method_params': []}],
                        'authenticated': 1})
        cls.hs = HandleService(url=cls.cfg['handle-service-url'],
                               token=cls.token)
        cls.shockURL = cls.cfg['shock-url']
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=cls.token)
        cls.serviceImpl = kb_ballgown(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        cls.gfu = GenomeFileUtil(cls.callback_url)
        cls.dfu = DataFileUtil(cls.callback_url)
        cls.ru = ReadsUtils(cls.callback_url)
        cls.rau = ReadsAlignmentUtils(cls.callback_url, service_ver='dev')
        cls.eu = ExpressionUtils(cls.callback_url, service_ver='dev')
        cls.set_api = SetAPI(cls.callback_url)

        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_ballgown_" + str(suffix)
        #cls.wsName = "test_kb_ballgown_1004"
        cls.wsClient.create_workspace({'workspace': cls.wsName})

        cls.nodes_to_delete = []
        cls.handles_to_delete = []

        cls.prepare_data()
Example #13
    def upload_alignment_set(self, alignment_items, alignmentset_name,
                             ws_name):
        """
        Compiles and saves a set of alignment references (+ other stuff) into a
        KBaseRNASeq.RNASeqAlignmentSet.
        Returns the reference to the new alignment set.

        alignment_items: [{
            "ref": alignment_ref,
            "label": condition label.
        }]
        # alignment_info = dict like this:
        # {
        #     reads_ref: {
        #         "ref": alignment_ref
        #     }
        # }
        # reads_info = dict like this:
        # {
        #     reads_ref: {
        #         "condition": "some condition"
        #     }
        # }
        # input_params = global input params to HISAT2, also has ws_name for the target workspace.
        # alignmentset_name = name of final set object.
        """
        print("Uploading completed alignment set")
        alignment_set = {
            "description": "Alignments using HISAT2, v.{}".format(HISAT_VERSION),
            "items": alignment_items
        }
        set_api = SetAPI(self.srv_wiz_url)
        set_info = set_api.save_reads_alignment_set_v1({
            "workspace": ws_name,
            "output_object_name": alignmentset_name,
            "data": alignment_set
        })
        return set_info["set_ref"]
Example #14
class GenDiffExprMatrix:

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'DEM_' + str(uuid.uuid4()))
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.fv = KBaseFeatureValues(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.setAPI = SetAPI(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self._mkdir_p(self.scratch)

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def setup_data(self):

        self.new_col_names = [
            'gene_id', 'log2_fold_change', 'p_value', 'q_value'
        ]

    def get_feature_ids(self, genome_ref):
        """
        _get_feature_ids: get feature ids from genome
        """

        feature_num = self.gsu.search({'ref': genome_ref})['num_found']

        genome_features = self.gsu.search({
            'ref': genome_ref,
            'limit': feature_num,
            'sort_by': [['feature_id', True]]
        })['features']

        features_ids = map(
            lambda genome_feature: genome_feature.get('feature_id'),
            genome_features)

        return list(set(features_ids))

    def gen_matrix(self, infile, old_col_names, delimiter):
        with open(infile, 'rb') as source:
            rdr = csv.DictReader(source, delimiter=delimiter)
            col_names = self.new_col_names[1:]
            row_names = []
            values = []
            for row in rdr:
                try:
                    values.append([float(row[v]) for v in old_col_names[1:]])
                except:
                    values_list = []
                    for v in old_col_names[1:]:
                        tmpval = row[v]
                        if isinstance(tmpval, (int, long, float)):
                            values_list.append(float(tmpval))
                        elif isinstance(tmpval, basestring):
                            if 'na' in tmpval.lower() or 'none' in tmpval.lower():
                                values_list.append(None)
                            else:
                                tmpval = tmpval.replace("'", "")
                                tmpval = tmpval.replace('"', '')
                                values_list.append(float(tmpval))
                        else:
                            raise ValueError(
                                "invalid type in input file: {}".format(
                                    tmpval))
                    values.append(values_list)
                row_names.append(row[old_col_names[0]])

        twoD_matrix = {
            'row_ids': row_names,
            'col_ids': col_names,
            'values': values
        }

        return twoD_matrix

    def get_max_fold_change_to_handle_inf(self, infile):
        maxvalue = 0
        with open(infile) as source:
            rdr = csv.DictReader(source, dialect='excel-tab')
            for line in rdr:
                log2fc_val = line.get('log2_fold_change')
                if not 'inf' in str(log2fc_val):
                    log2fc = abs(float(log2fc_val))
                    if log2fc > maxvalue:
                        maxvalue = log2fc

            print 'maxvalue: ', maxvalue
            return maxvalue

    def gen_cuffdiff_matrix(self, infile, delimiter='\t'):

        max_value = self.get_max_fold_change_to_handle_inf(infile)
        with open(infile, 'rb') as source:
            rdr = csv.DictReader(source, delimiter=delimiter)
            col_names = self.new_col_names[1:]

            row_names = []
            values = []
            for row in rdr:

                log2fc_val = row.get('log2_fold_change')
                # print 'FC_VAL: ', log2fc_val
                if '-inf' in str(log2fc_val):
                    row['log2_fold_change'] = -float(max_value)
                elif 'inf' in str(log2fc_val):
                    row['log2_fold_change'] = float(max_value)
                elif 'nan' in str(log2fc_val):
                    row['log2_fold_change'] = None

                try:
                    values.append(
                        [float(row[v]) for v in self.new_col_names[1:]])
                except:
                    values.append(
                        [None] +
                        [float(row[v]) for v in self.new_col_names[2:]])

                row_names.append(row[self.new_col_names[0]])

        tmatrix = {
            'row_ids': row_names,
            'col_ids': col_names,
            'values': values
        }

        return tmatrix

    def save_diff_expr_matrix(self, obj_name, data_matrix, condition1,
                              condition2):

        dem_data = {
            'genome_ref': self.params.get('genome_ref'),
            'data': data_matrix,
            'condition_mapping': {
                condition1: condition2
            },
            'type': 'log2_level',
            'scale': '1.0'
        }
        res = self.dfu.save_objects({
            'id': self.params.get('ws_id'),
            "objects": [{
                "type": "KBaseFeatureValues.DifferentialExpressionMatrix",
                "data": dem_data,
                "name": obj_name,
                "extra_provenance_input_refs": [self.params.get('genome_ref')]
            }]
        })[0]
        ret_ref = str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        return ret_ref

    def save_diff_expr_matrix_set(self, obj_name, matrix_set):

        res = self.setAPI.save_differential_expression_matrix_set_v1({
            "workspace": self.params.get('ws_name'),
            "output_object_name": obj_name,
            "data": matrix_set
        })
        return res.get('set_ref')

    #
    # ballgown always outputs a linear fold change, which we need to convert to log2
    # before storing
    #

    def safely_apply_log2_to_fc(self, row):
        if row[0]:
            fc = row[0]
            if fc < 1.0e-10:
                fc = fc + 1.0e-10  # in case fc is zero
            return ([log2(fc)] + row[1:])
        else:
            return (row)

    def process_ballgown_file(self, diffexpr_filepath):

        ballgown_col_names = ['id', 'fc', 'pval', 'qval']

        data_matrix = self.gen_matrix(diffexpr_filepath,
                                      ballgown_col_names,
                                      delimiter='\t')
        log2_data_matrix = data_matrix
        log2_data_matrix['values'] = map(self.safely_apply_log2_to_fc,
                                         data_matrix.get('values'))

        dem_ref = self.save_diff_expr_matrix(
            self.params.get('obj_name') + '_0', log2_data_matrix, None, None)
        set_items = [{
            'label': 'global Differential Expression Data',
            'ref': dem_ref
        }]
        matrix_set = {
            'description': 'ballgown Diff Exp Matrix Set',
            'items': set_items
        }
        return self.save_diff_expr_matrix_set(self.params.get('obj_name'),
                                              matrix_set)

    def process_deseq_file(self, diffexpr_filepath):

        deseq_col_names = ['geneID', 'log2FoldChange', 'pvalue', 'padj']

        data_matrix = self.gen_matrix(diffexpr_filepath,
                                      deseq_col_names,
                                      delimiter=',')

        dem_ref = self.save_diff_expr_matrix(
            self.params.get('obj_name') + '_0', data_matrix, None, None)
        set_items = [{
            'label': 'global Differential Expression Data',
            'ref': dem_ref
        }]
        matrix_set = {
            'description': 'deseq Diff Exp Matrix Set',
            'items': set_items
        }
        return self.save_diff_expr_matrix_set(self.params.get('obj_name'),
                                              matrix_set)

    def process_cuffdiff_file(self, diffexpr_filepath):

        cuffdiff_col_names = [
            'gene', 'log2(fold_change)', 'p_value', 'q_value'
        ]

        ConditionPair = namedtuple("ConditionPair",
                                   ["condition1", "condition2"])
        FileInfo = namedtuple('FileInfo', ['file_path', 'file_obj'])

        condPair_fileInfo = {}

        timestamp = str(
            int((datetime.utcnow() -
                 datetime.utcfromtimestamp(0)).total_seconds() * 1000))
        with open(diffexpr_filepath, 'rb') as source:
            rdr = csv.DictReader(source, dialect='excel-tab')
            """
            save the files opened for writing in outfiles list, so they can be closed later
            """
            outfiles = list()

            for r in rdr:
                c1 = r['sample_1']
                c2 = r['sample_2']

                cond_pair = ConditionPair(condition1=c1, condition2=c2)
                tsv_file_info = condPair_fileInfo.get(cond_pair, None)
                if tsv_file_info is None:
                    tsv_file_name = timestamp + '_' + c1 + '~~' + c2
                    tsv_file_path = os.path.join(self.scratch, tsv_file_name)
                    outfile = open(tsv_file_path, 'wb')
                    outfiles.append(outfile)
                    csv_wtr = csv.DictWriter(outfile,
                                             delimiter='\t',
                                             fieldnames=self.new_col_names)
                    csv_wtr.writerow(
                        dict((cn, cn) for cn in self.new_col_names))
                    tsv_file_info = FileInfo(file_path=tsv_file_path,
                                             file_obj=csv_wtr)
                    condPair_fileInfo[cond_pair] = tsv_file_info

                wtr = tsv_file_info.file_obj
                col_vals = [r[v] for v in cuffdiff_col_names]
                wtr.writerow(dict(zip(self.new_col_names, col_vals)))

            for ofile in outfiles:
                ofile.close()

            set_items = list()
            for cond_pair, file_info in condPair_fileInfo.iteritems():
                print 'Cond_pair: ', cond_pair
                print 'File: ', file_info.file_path
                tsv_file = file_info.file_path

                data_matrix = self.gen_cuffdiff_matrix(tsv_file)

                object_name = self.get_obj_name(self.params['obj_name'],
                                                cond_pair.condition1,
                                                cond_pair.condition2)
                dem_ref = self.save_diff_expr_matrix(object_name, data_matrix,
                                                     cond_pair.condition1,
                                                     cond_pair.condition2)
                print('process_cuffdiff_file: DEM_REF: ' + dem_ref)
                set_items.append({
                    'label': cond_pair.condition1 + ', ' + cond_pair.condition2,
                    'ref': dem_ref
                })

        matrix_set = {
            'description': 'cuffdiff Diff Exp Matrix Set',
            'items': set_items
        }
        return self.save_diff_expr_matrix_set(self.params.get('obj_name'),
                                              matrix_set)

    """
    Functions for save_differentialExpressionMatrixSet
    """

    def save_matrix(self, genome_ref, infile, in_col_names, delimiter):

        feature_ids = self.get_feature_ids(genome_ref)

        with open(infile, 'rb') as source:
            rdr = csv.DictReader(source, delimiter=delimiter)
            col_names = in_col_names[1:]
            row_names = []
            values = []
            for row in rdr:
                if row[in_col_names[0]] in feature_ids:
                    row_names.append(row[in_col_names[0]])
                else:
                    gene_ids = row[in_col_names[0]].strip().split(',')
                    match = True
                    mismatched_gene_ids = list()
                    for gene_id in gene_ids:
                        gene_id = gene_id.strip()
                        if gene_id not in feature_ids:
                            mismatched_gene_ids.append(gene_id)
                            match = False
                    if match:
                        row_names.append(row[in_col_names[0]])
                    else:
                        error_msg = 'Gene_id(s) "{}" is not a known feature in "{}"'.format(
                            ', '.join(mismatched_gene_ids),
                            self.params.get('genome_ref'))
                        raise ValueError(error_msg)
                try:
                    values.append([float(row[v]) for v in in_col_names[1:]])
                except:
                    values_list = []
                    for v in in_col_names[1:]:
                        tmpval = row[v]
                        if isinstance(tmpval, (int, long, float)):
                            values_list.append(float(tmpval))
                        elif isinstance(tmpval, basestring):
                            if 'na' in tmpval.lower() or 'none' in tmpval.lower():
                                values_list.append(None)
                            else:
                                tmpval = tmpval.replace("'", "")
                                tmpval = tmpval.replace('"', '')
                                values_list.append(float(tmpval))
                        else:
                            raise ValueError(
                                "invalid type in input file: {}".format(
                                    tmpval))
                    values.append(values_list)

        twoD_matrix = {
            'row_ids': row_names,
            'col_ids': col_names,
            'values': values
        }

        return twoD_matrix

    @staticmethod
    def get_obj_name(obj_name, condition1, condition2):
        def sanitize(ws_name):
            # I'm not using translate because it's a mess with mixed unicode & strings
            return ws_name.replace("\t", " ").replace(" ",
                                                      "_").replace("/", "|")

        return "{}-{}-{}".format(obj_name, sanitize(condition1),
                                 sanitize(condition2))

    def gen_diffexpr_matrices(self, params):

        print('In GEN DEMs')
        self.params = params
        self.setup_data()
        diffexpr_filepath = self.params.get('diffexpr_filepath')

        if 'deseq' in self.params.get('tool_used').lower():
            dem_ref = self.process_deseq_file(diffexpr_filepath)
        elif 'ballgown' in self.params.get('tool_used').lower():
            dem_ref = self.process_ballgown_file(diffexpr_filepath)
        elif 'cuffdiff' in self.params.get('tool_used').lower():
            dem_ref = self.process_cuffdiff_file(diffexpr_filepath)
        else:
            raise ValueError('"{}" is not a valid tool_used parameter'.format(
                self.params.get('tool_used')))
        return dem_ref

    def save_diffexpr_matrices(self, params):

        print('In SAVE DEMs')
        self.params = params
        self.setup_data()

        set_items = list()
        for deFile in self.params.get('diffexpr_data'):
            condition_mapping = deFile.get('condition_mapping')
            diffexpr_filepath = deFile.get('diffexpr_filepath')

            if deFile.get('delimter', None) is not None:
                delimiter = deFile.get('delimter')
            else:
                delimiter = '\t'
                fileext = os.path.splitext(diffexpr_filepath)[1]

                if 'csv' in fileext.lower():
                    delimiter = ','
                elif 'tsv' in fileext.lower():
                    delimiter = '\t'
                else:
                    print('Using tab delimiter')

            data_matrix = self.save_matrix(self.params.get('genome_ref'),
                                           diffexpr_filepath,
                                           self.new_col_names, delimiter)

            condition1, condition2 = condition_mapping.items()[0]
            object_name = self.get_obj_name(self.params['obj_name'],
                                            condition1, condition2)
            dem_ref = self.save_diff_expr_matrix(object_name, data_matrix,
                                                 condition1, condition2)
            set_items.append({
                'label': condition1 + ', ' + condition2,
                'ref': dem_ref
            })

        matrix_set = {
            'description': self.params.get('tool_used') +
            ' Differential Expression Matrix Set',
            'items': set_items
        }
        return self.save_diff_expr_matrix_set(self.params.get('obj_name'),
                                              matrix_set)
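
A sketch of driving the class above end-to-end, assuming a hypothetical DESeq2 result file and references; config is the same deployment dictionary used by the constructor, and SDK_CALLBACK_URL must be set in the environment:

gen_dem = GenDiffExprMatrix(config)
dem_set_ref = gen_dem.gen_diffexpr_matrices({
    "tool_used": "deseq",                                           # routes to process_deseq_file
    "diffexpr_filepath": "/kb/module/work/tmp/deseq_results.csv",   # hypothetical file path
    "genome_ref": "12345/8/1",                                      # hypothetical genome reference
    "obj_name": "my_deseq_dem",
    "ws_name": "my_workspace",
    "ws_id": 12345,
})
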
Example #15
    def save_read_set(self, ctx, params):
        """
        :param params: instance of type "save_read_set_params" (** **  Method
           for adding Reads objects to a Reads Set) -> structure: parameter
           "workspace_name" of String, parameter "output_readset_name" of
           String, parameter "input_reads_list" of list of String, parameter
           "desc" of String
        :returns: instance of type "save_read_set_output" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN save_read_set

        console = []
        invalid_msgs = []
        #self.log(console,'Running save_read_set with params=')
        #self.log(console, "\n"+pformat(params))
        report = ''
        #        report = 'Running KButil_Add_Genomes_to_GenomeSet with params='
        #        report += "\n"+pformat(params)

        #### do some basic checks
        #
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'desc' not in params:
            raise ValueError('desc parameter is required')
        if 'input_reads_list' not in params:
            raise ValueError('input_reads_list parameter is required')
        #if 'input_readsset_name' not in params:
        #    raise ValueError('input_readsset_name parameter is optional')
        if 'output_readset_name' not in params:
            raise ValueError('output_readset_name parameter is required')

        # Build ReadsSet
        #
        elements = dict()

        savereadssetparams = {}
        savereadssetparams['workspace_name'] = params['workspace_name']
        savereadssetparams['output_object_name'] = params[
            'output_readset_name']
        readsetdata = {}
        if (params['desc'] is not None):
            readsetdata['description'] = params['desc']
        readsetdata['items'] = []

        print "WS " + params['workspace_name']
        print "READS " + str(params['input_reads_list'])
        # add new reads
        for reads_name in params['input_reads_list']:
            readssetitem = {}
            readssetitem['ref'] = params['workspace_name'] + '/' + reads_name
            readssetitem['label'] = ''
            readsetdata['items'].append(readssetitem)

        savereadssetparams['data'] = readsetdata

        # load the method provenance from the context object
        #
        #self.log(console,"Setting Provenance")  # DEBUG
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        try:
            prov_defined = provenance[0]['input_ws_objects']
        except:
            provenance[0]['input_ws_objects'] = []
        for reads_name in params['input_reads_list']:
            provenance[0]['input_ws_objects'].append(params['workspace_name'] +
                                                     '/' + reads_name)
        provenance[0]['service'] = 'ReadssetEditor'
        provenance[0]['method'] = 'save_read_set'

        # Save output object
        #
        #if len(invalid_msgs) == 0:
        #    self.log(console,"Saving ReadssSet")

        set_api = SetAPI(url=self.servicewizardURL, token=ctx['token'])
        #set_api._service_ver = "dev"
        set_api.save_reads_set_v1(savereadssetparams)

        # build output report object
        #
        #self.log(console,"BUILDING REPORT")  # DEBUG
        if len(invalid_msgs) == 0:
            #self.log(console,"reads in output set "+params['output_readset_name']+": "+str(len(elements.keys())))
            report += 'reads in output set ' + params['output_readset_name'] + \
                      ': ' + str(len(readsetdata['items'])) + "\n"
            reportObj = {
                'objects_created': [{
                    'ref': params['workspace_name'] + '/' + params['output_readset_name'],
                    'description': 'save_read_set'
                }],
                'text_message': report
            }
        else:
            report += "FAILURE:\n\n" + "\n".join(invalid_msgs) + "\n"
            reportObj = {'objects_created': [], 'text_message': report}
        reportName = 'save_read_set_report_' + str(hex(uuid.getnode()))
        ws = workspaceService(self.workspaceURL, token=ctx['token'])
        report_obj_info = ws.save_objects({
            'workspace': params['workspace_name'],
            'objects': [{
                'type': 'KBaseReport.Report',
                'data': reportObj,
                'name': reportName,
                'meta': {},
                'hidden': 1,
                'provenance': provenance
            }]
        })[0]

        # Build report and return
        #
        #self.log(console,"BUILDING RETURN OBJECT")
        returnVal = {
            'report_name': reportName,
            'report_ref': (str(report_obj_info[6]) + '/' + str(report_obj_info[0]) +
                           '/' + str(report_obj_info[4])),
        }
        #self.log(console,"save_read_set DONE")

        #END save_read_set

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method save_read_set return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
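
A sketch of the params structure save_read_set expects; the workspace, reads object names, and output set name are hypothetical, and impl/ctx stand for an instance of the Impl class and its method context:

params = {
    "workspace_name": "my_workspace",
    "desc": "reads set built from three libraries",
    "input_reads_list": ["reads_lib_1", "reads_lib_2", "reads_lib_3"],
    "output_readset_name": "my_reads_set",
}
result = impl.save_read_set(ctx, params)[0]
print(result["report_name"] + " " + result["report_ref"])
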
Example #16
def fetch_reads_refs_from_sampleset(ref, ws_url, callback_url, params):
    """
    From the given object ref, return a list of all reads objects that are a part of that
    object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary
    refs that are a member of that ReadsSet. This is returned as a list of dictionaries as follows:
    {
        "ref": reads object reference,
        "condition": condition string associated with that reads object
    }
    The only one required is "ref", all other keys may or may not be present, based on the reads
    object or object type in initial ref variable. E.g. a RNASeqSampleSet might have condition info
    for each reads object, but a single PairedEndLibrary may not have that info.
    If ref is already a Reads library, just returns a list with ref as a single element.
    """
    obj_type = get_object_type(ref, ws_url)
    ws = Workspace(ws_url)
    refs = list()
    refs_for_ws_info = list()
    if "KBaseSets.ReadsSet" in obj_type:
        print("Looking up reads references in ReadsSet object")
        set_client = SetAPI(callback_url)
        reads_set = set_client.get_reads_set_v1({
            "ref": ref,
            "include_item_info": 0
        })
        for reads in reads_set["data"]["items"]:
            refs.append({"ref": reads["ref"], "condition": reads["label"]})
            refs_for_ws_info.append({'ref': reads['ref']})
    elif "KBaseRNASeq.RNASeqSampleSet" in obj_type:
        print("Looking up reads references in RNASeqSampleSet object")
        sample_set = ws.get_objects2({"objects": [{
            "ref": ref
        }]})["data"][0]["data"]
        for i in range(len(sample_set["sample_ids"])):
            refs.append({
                "ref": sample_set["sample_ids"][i],
                "condition": sample_set["condition"][i]
            })
            refs_for_ws_info.append({'ref': sample_set['sample_ids'][i]})
    elif ("KBaseAssembly.SingleEndLibrary" in obj_type
          or "KBaseFile.SingleEndLibrary" in obj_type
          or "KBaseFile.SingleEndLibrary-2.0" in obj_type
          or "KBaseFile.SingleEndLibrary-2.1" in obj_type
          or "KBaseAssembly.PairedEndLibrary" in obj_type
          or "KBaseFile.PairedEndLibrary" in obj_type
          or "KBaseFile.PairedEndLibrary-2.0" in obj_type
          or "KBaseFile.PairedEndLibrary-2.1" in obj_type):
        refs.append({"ref": ref})
        refs_for_ws_info.append({'ref': ref})
    else:
        raise ValueError("Unable to fetch reads reference from object {} "
                         "which is a {}".format(ref, obj_type))

    # get object info so we can name things properly
    infos = ws.get_object_info3({'objects': refs_for_ws_info})['infos']

    name_ext = '_alignment'
    if ('alignment_suffix' in params
            and params['alignment_suffix'] is not None):
        ext = params['alignment_suffix'].replace(' ', '')
        if ext:
            name_ext = ext

    unique_names = get_unique_names(infos)
    for k in range(0, len(refs)):
        refs[k]['info'] = infos[k]
        name = unique_names[k] + name_ext
        refs[k]['alignment_output_name'] = name

    return refs
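
Usage is analogous to Example #6, with an extra params dict that controls the per-alignment output names; a hedged sketch with hypothetical values:

refs = fetch_reads_refs_from_sampleset(
    ref="12345/6/7",                                  # ReadsSet, sample set, or reads library ref
    ws_url="https://<host>/services/ws",
    callback_url="http://localhost:5000",
    params={"alignment_suffix": "_hisat2_alignment"},
)
for r in refs:
    print(r["ref"] + " -> " + r["alignment_output_name"])
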
Example #17
class MetagenomeFileUtils:
    def _validate_merge_bins_from_binned_contig_params(self, params):
        """
        _validate_merge_bins_from_binned_contig_params:
                validates params passed to merge_bins_from_binned_contig method

        """
        log('Start validating merge_bins_from_binned_contig params')

        # check for required parameters
        for p in [
                'old_binned_contig_ref', 'bin_merges',
                'output_binned_contig_name', 'workspace_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        bin_merges = params.get('bin_merges')

        if not isinstance(bin_merges, list):
            error_msg = 'expecting a list for bin_merges param, '
            error_msg += 'but getting a [{}]'.format(type(bin_merges))
            raise ValueError(error_msg)

        for bin_merge in bin_merges:
            for p in ['new_bin_id', 'bin_to_merge']:
                if p not in bin_merge:
                    raise ValueError(
                        '"{}" key is required in bin_merges, but missing'.
                        format(p))

            bin_to_merge = bin_merge.get('bin_to_merge')

            if not isinstance(bin_to_merge, list):
                error_msg = 'expecting a list for bin_to_merge, '
                error_msg += 'but getting a [{}]'.format(type(bin_to_merge))
                raise ValueError(error_msg)

    def _validate_remove_bins_from_binned_contig_params(self, params):
        """
        _validate_remove_bins_from_binned_contig_params:
                validates params passed to remove_bins_from_binned_contig method

        """
        log('Start validating remove_bins_from_binned_contig params')

        # check for required parameters
        for p in [
                'old_binned_contig_ref', 'bins_to_remove',
                'output_binned_contig_name', 'workspace_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        bins_to_remove = params.get('bins_to_remove')

        if not isinstance(bins_to_remove, list):
            error_msg = 'expecting a list for bins_to_remove param, '
            error_msg += 'but getting a [{}]'.format(type(bins_to_remove))
            raise ValueError(error_msg)

    def _validate_file_to_binned_contigs_params(self, params):
        """
        _validate_file_to_binned_contigs_params:
                validates params passed to file_to_binned_contigs method

        """
        log('Start validating file_to_binned_contigs params')

        # check for required parameters
        for p in [
                'assembly_ref', 'file_directory', 'binned_contig_name',
                'workspace_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _validate_binned_contigs_to_file_params(self, params):
        """
        _validate_binned_contigs_to_file_params:
                validates params passed to binned_contigs_to_file method

        """

        log('Start validating binned_contigs_to_file params')

        # check for required parameters
        for p in ['input_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _validate_extract_binned_contigs_as_assembly_params(self, params):
        """
        _validate_extract_binned_contigs_as_assembly_params:
                validates params passed to extract_binned_contigs_as_assembly method

        """

        log('Start validating extract_binned_contigs_as_assembly params')

        # check for required parameters
        for p in [
                'binned_contig_obj_ref', 'extracted_assemblies',
                'assembly_suffix', 'workspace_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # convert comma-separated list of bins into a list of individual ids (the python
        # comprehension construction deals with the fact that split(',') returns a list of
        # length one, [''], for an empty string input)

        extracted_assemblies = [
            x for x in params.get('extracted_assemblies').split(',') if x
        ]

        # parameter assembly_set_name is required if extracted_assemblies list has more
        # than one element

        if len(extracted_assemblies) > 1 and 'assembly_set_name' not in params:
            raise ValueError(
                '"assembly_set_names" parameter is required for more than one extracted assembly'
            )

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _get_bin_ids(self, file_directory):
        """
        _get_bin_ids: getting bin contig ids from files

        NOTE: This method is very specific to MaxBin2 app result.
              Bin contig files generated by MaxBin2 follow 'header.0xx.fasta' name pattern
        """

        bin_ids = []

        result_files = os.listdir(file_directory)

        for file in result_files:
            if re.match(r'.*\.\d{3}\.fasta', file):
                bin_ids.append(file)

        log('generated bin ids:\n{}'.format('\n'.join(bin_ids)))

        return bin_ids

    def _process_summary_file(self, bin_id, lines):
        """
        _process_summary_file: process header.summary file content
                               getting GC content (gc), Genome size (sum_contig_len)
                               and Completeness (cov) from header.summary file

        NOTE: This method is very specific to MaxBin2 app result.

        header.summary file can be in one of the formats below:
        Bin name                  Abundance  Completeness    Genome size     GC content
        maxbin_output.001.fasta   0.00       97.2%           2690533         52.9

        Bin name                  Completeness    Genome size     GC content
        maxbin_output.001.fasta   97.2%           2690533         52.9
        """

        for line in lines:
            line_list = line.split('\t')
            if line_list[0] == bin_id:
                if len(line_list) == 5:
                    gc = round(float(line_list[4]) / 100, 5)
                    sum_contig_len = int(line_list[3])
                    cov = round(float(line_list[2].partition('%')[0]) / 100, 5)
                elif len(line_list) == 4:
                    gc = round(float(line_list[3]) / 100, 5)
                    sum_contig_len = int(line_list[2])
                    cov = round(float(line_list[1].partition('%')[0]) / 100, 5)

        return gc, sum_contig_len, cov

    def _get_total_contig_len(self, file_directory):
        """
        _get_total_contig_len: process header.summary file content
                               getting total contig length from header.summary file

        NOTE: This method is very specific to MaxBin2 app result.
        """

        log('generating total contig length')
        total_contig_len = 0

        file_list = os.listdir(file_directory)
        for file in file_list:
            if file.endswith('.summary'):
                with open(os.path.join(file_directory, file),
                          'r') as summary_file:
                    lines = summary_file.readlines()
                    for line in lines[1:]:
                        line_list = line.split('\t')
                        if len(line_list) == 5:
                            total_contig_len += int(line_list[3])
                        elif len(line_list) == 4:
                            total_contig_len += int(line_list[2])

        log('generated total contig length: {}'.format(total_contig_len))
        return total_contig_len

    def _generate_contig_bin_summary(self, bin_id, file_directory):
        """
        _generate_contig_bin_summary: getting ContigBin summary from header.summary file

        NOTE: This method is very specific to MaxBin2 app result.
        """
        log('generating summary for bin_id: {}'.format(bin_id))

        file_list = os.listdir(file_directory)

        for file in file_list:
            if file.endswith('.summary'):
                with open(os.path.join(file_directory, file),
                          'r') as summary_file:
                    lines = summary_file.readlines()
                    gc, sum_contig_len, cov = self._process_summary_file(
                        bin_id, lines)

        log('generated GC content: {}, Genome size: {} '.format(
            gc, sum_contig_len))
        log('and Completeness: {} for bin_id: {}'.format(cov, bin_id))
        return gc, sum_contig_len, cov

    def _generate_contigs(self, file_name, file_directory, assembly_ref):
        """
        _generate_contigs: generate contigs from assembly object

        file_name: file name of fasta file
        file_directory: fasta file directory
        assembly_ref: associated assembly object reference
        """

        log('start generating contig objects for file: {}'.format(file_name))

        assembly = self.dfu.get_objects({'object_refs':
                                         [assembly_ref]})['data'][0]
        assembly_contigs = assembly.get('data').get('contigs')

        contigs = {}
        for record in SeqIO.parse(os.path.join(file_directory, file_name),
                                  "fasta"):

            contig_id = record.id
            contig = assembly_contigs.get(contig_id)

            if contig:
                # using assembly object data
                contig_gc = contig.get('gc_content')
                sequence_length = contig.get('length')
            else:
                log('cannot find contig [{}] from assembly.'.format(contig_id))
                log('computing contig info')

                sequence = str(record.seq).upper()
                sequence_length = len(sequence)

                contig_gc_len = 0
                contig_gc_len += sequence.count('G')
                contig_gc_len += sequence.count('C')

                contig_gc = round(
                    float(contig_gc_len) / float(sequence_length), 5)

            contig = {'gc': contig_gc, 'len': sequence_length}
            contigs[contig_id] = contig

        log('complete generating contig objects for file: {}'.format(
            file_name))

        return contigs

    def _generate_contig_bin(self, bin_id, file_directory, assembly_ref):
        """
        _generate_contig_bin: generate ContigBin structure
        """
        log('start generating BinnedContig info for bin: {}'.format(bin_id))

        # generate ContigBin summary info
        gc, sum_contig_len, cov = self._generate_contig_bin_summary(
            bin_id, file_directory)

        # generate Contig info
        contigs = self._generate_contigs(bin_id, file_directory, assembly_ref)

        contig_bin = {
            'bid': bin_id,
            'contigs': contigs,
            'n_contigs': len(contigs),
            'gc': gc,
            'sum_contig_len': sum_contig_len,
            'cov': cov
        }

        log('complete generating BinnedContig info for bin: {}'.format(bin_id))

        return contig_bin

    def _get_contig_file(self, assembly_ref):
        """
        _get_contig_file: get contig file from GenomeAssembly object
        """

        log('retrieving contig file from assembly: {}'.format(assembly_ref))
        contig_file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref
        }).get('path')

        sys.stdout.flush()
        contig_file = self.dfu.unpack_file({'file_path':
                                            contig_file})['file_path']

        log('saved contig file to: {}'.format(contig_file))

        return contig_file

    def _get_contig_string(self, contig_id, assembly_contig_file,
                           parsed_assembly):
        """
        _get_contig_string: find and return contig string from assembly contig file
        """

        # parsed_assembly = SeqIO.to_dict(SeqIO.parse(assembly_contig_file, "fasta"))

        contig_record = parsed_assembly.get(contig_id)

        if contig_record:
            string_contig = ''
            string_contig += '>{}\n'.format(contig_id)
            string_contig += str(contig_record.seq).upper()
            string_contig += '\n'
        else:
            error_msg = 'Cannot find contig [{}] from file [{}].'.format(
                contig_id, assembly_contig_file)
            raise ValueError(error_msg)

        return string_contig

    def _pack_file_to_shock(self, result_files):
        """
        _pack_file_to_shock: pack files in result_files list and save in shock
        """

        log('start packing and uploading files:\n{}'.format(
            '\n'.join(result_files)))

        output_directory = os.path.join(
            self.scratch, 'packed_binned_contig_' + str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(
            output_directory,
            'packed_binned_contig_' + str(uuid.uuid4()) + '.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for file in result_files:
                zip_file.write(file, os.path.basename(file))

        shock_id = self.dfu.file_to_shock({
            'file_path': result_file
        }).get('shock_id')

        log('saved file to shock: {}'.format(shock_id))

        return shock_id

    def _generate_report(self, report_message, params):
        """
        _generate_report: generate summary report

        """
        log('Generating report')

        uuid_string = str(uuid.uuid4())
        upload_message = 'Job Finished\n\n'
        upload_message += report_message

        log('Report message:\n{}'.format(upload_message))

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'MetagenomeUtils_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _generate_report_message(self, new_binned_contig_ref):
        """
        _generate_report_message: generate a report message for BinnedContig object
        """

        report_message = ''

        binned_contig = self.dfu.get_objects(
            {'object_refs': [new_binned_contig_ref]})['data'][0]
        binned_contig_info = binned_contig.get('info')
        binned_contig_name = binned_contig_info[1]
        report_message += 'Generated BinnedContig: {} [{}]\n'.format(
            binned_contig_name, new_binned_contig_ref)

        binned_contig_count = 0
        total_bins = binned_contig.get('data').get('bins')
        total_bins_count = len(total_bins)
        bin_ids = []
        for bin in total_bins:
            binned_contig_count += len(bin.get('contigs'))
            bin_ids.append(bin.get('bid'))

        report_message += '--------------------------\nSummary:\n\n'
        report_message += 'Binned contigs: {}\n'.format(binned_contig_count)
        report_message += 'Total number of bins: {}\n'.format(total_bins_count)
        report_message += 'Bin IDs:\n{}\n'.format('\n'.join(bin_ids))

        return report_message

    def _merge_bins(self, new_bin_id, bin_objects_to_merge):
        """
        _merge_bins: merge a list of bins into new_bin_id

        """
        total_contigs = {}
        total_gc_count = 0
        total_sum_contig_len = 0
        total_cov_len = 0

        for bin in bin_objects_to_merge:
            total_contigs.update(bin.get('contigs'))
            sum_contig_len = bin.get('sum_contig_len')
            total_sum_contig_len += sum_contig_len
            total_gc_count += sum_contig_len * bin.get('gc')
            total_cov_len += sum_contig_len * bin.get('cov')

        contig_bin = {
            'bid': new_bin_id,
            'contigs': total_contigs,
            'n_contigs': len(total_contigs),
            'gc': round(float(total_gc_count) / total_sum_contig_len, 5),
            'sum_contig_len': total_sum_contig_len,
            'cov': round(float(total_cov_len) / total_sum_contig_len, 5)
        }

        return contig_bin

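    # Hedged worked example of the merge arithmetic above (made-up numbers):
    # merging bins with sum_contig_len 1000 (gc 0.40, cov 0.90) and
    # 3000 (gc 0.60, cov 0.70) gives the length-weighted averages
    #   gc  = (1000 * 0.40 + 3000 * 0.60) / 4000 = 0.55
    #   cov = (1000 * 0.90 + 3000 * 0.70) / 4000 = 0.75
    # which matches what _merge_bins computes before rounding to 5 decimals.
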
    def _save_binned_contig(self, binned_contigs, workspace_name,
                            binned_contig_name):
        """
        _save_binned_contig: save BinnedContig object
        """

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        object_type = 'KBaseMetagenomes.BinnedContigs'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': binned_contigs,
                'name': binned_contig_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        new_binned_contig_ref = str(dfu_oi[6]) + '/' + str(
            dfu_oi[0]) + '/' + str(dfu_oi[4])
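        # dfu_oi is a standard workspace object_info tuple; indexes 6, 0 and 4
        # hold the workspace id, object id and version, so the reference takes
        # the usual 'wsid/objid/version' form (e.g. a hypothetical '4567/11/1').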

        return new_binned_contig_ref

    def _check_bin_merges(self, bin_merges):
        """
        _check_bin_merges: validate bin_merges
        """
        bin_id_list = [item.get('bin_to_merge') for item in bin_merges]
        bin_ids = [bin_id for item in bin_id_list for bin_id in item]

        for bin_id in bin_id_list:
            if len(bin_id) <= 1:
                raise ValueError(
                    "Please provide at least two bin_ids to merge")
            for id in bin_id:
                if bin_ids.count(id) > 1:
                    raise ValueError(
                        "Same bin [{}] appears in multiple merges".format(id))

        new_bin_id_list = [item.get('new_bin_id') for item in bin_merges]
        for new_bin_id in new_bin_id_list:
            if new_bin_id_list.count(new_bin_id) > 1:
                raise ValueError(
                    "Same new Bin ID [{}] appears in multiple merges".format(
                        new_bin_id))

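    # Illustrative bin_merges payload accepted by _check_bin_merges (the bin
    # ids below are hypothetical MaxBin2-style file names):
    #
    #   bin_merges = [
    #       {'new_bin_id': 'merged_bin.001',
    #        'bin_to_merge': ['out.001.fasta', 'out.002.fasta']},
    #       {'new_bin_id': 'merged_bin.002',
    #        'bin_to_merge': ['out.003.fasta', 'out.004.fasta']}
    #   ]
    #
    # Each merge needs at least two source bins, and no bin id or new bin id
    # may appear in more than one merge.
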
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.setapi = SetAPI(self.callback_url)
        self.wss = workspaceService(config['workspace-url'])

    def file_to_binned_contigs(self, params):
        """
        file_to_binned_contigs: Generating BinnedContigs object from files

        input params:
        file_directory: file directory containing compressed/unpacked contig file(s) to
                        build BinnedContig object
        assembly_ref: metagenome assembly object reference
        binned_contig_name: BinnedContig object name
        workspace_name: the name/id of the workspace it gets saved to

        return params:
        binned_contig_obj_ref: generated result BinnedContig object reference
        """

        log('--->\nrunning MetagenomeFileUtils.file_to_binned_contigs\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_file_to_binned_contigs_params(params)

        file_directory = params.get('file_directory')
        assembly_ref = params.get('assembly_ref')

        log('starting generating BinnedContig object')
        bin_ids = self._get_bin_ids(file_directory)

        bins = []
        for bin_id in bin_ids:
            contig_bin = self._generate_contig_bin(bin_id, file_directory,
                                                   assembly_ref)
            bins.append(contig_bin)
        log('finished generating BinnedContig object')

        total_contig_len = self._get_total_contig_len(file_directory)

        binned_contigs = {
            'assembly_ref': assembly_ref,
            'bins': bins,
            'total_contig_len': total_contig_len
        }

        binned_contig_obj_ref = self._save_binned_contig(
            binned_contigs, params.get('workspace_name'),
            params.get('binned_contig_name'))

        returnVal = {'binned_contig_obj_ref': binned_contig_obj_ref}
        log('successfully saved BinnedContig object')

        return returnVal

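    # Minimal usage sketch (placeholder directory, refs and names; `mfu` is a
    # configured instance of this class):
    #
    #   ret = mfu.file_to_binned_contigs({
    #       'file_directory': '/kb/module/work/tmp/maxbin_output',
    #       'assembly_ref': '1234/5/6',
    #       'binned_contig_name': 'MyBinnedContigs',
    #       'workspace_name': 'my_workspace'
    #   })
    #   binned_contig_obj_ref = ret['binned_contig_obj_ref']
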
    def binned_contigs_to_file(self, params):
        """
        binned_contigs_to_file: Convert BinnedContig object to fasta files and pack them to shock

        input params:
        input_ref: BinnedContig object reference

        optional params:
        save_to_shock: saving result bin files to shock. default to True
        bin_id_list: only extract bin_id_list

        return params:
        shock_id: saved packed file shock id
        bin_file_directory: directory that contains all bin files
        """

        log('--->\nrunning MetagenomeFileUtils.binned_contigs_to_file\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_binned_contigs_to_file_params(params)

        binned_contig_object = self.dfu.get_objects(
            {'object_refs': [params.get('input_ref')]})['data'][0]

        assembly_ref = binned_contig_object.get('data').get('assembly_ref')
        assembly_contig_file = self._get_contig_file(assembly_ref)
        log('parsing assembly file [{}] to dictionary'.format(
            assembly_contig_file))
        parsed_assembly = SeqIO.to_dict(
            SeqIO.parse(assembly_contig_file, "fasta"))

        bins = binned_contig_object.get('data').get('bins')

        result_directory = os.path.join(
            self.scratch, 'binned_contig_files_' + str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        result_files = []
        bin_id_list = params.get('bin_id_list')
        for bin in bins:
            bin_id = bin.get('bid')
            if bin_id_list:
                if bin_id in bin_id_list:
                    log('processing bin: {}'.format(bin_id))
                    with open(os.path.join(result_directory, bin_id),
                              'w') as file:
                        contigs = bin.get('contigs')
                        for contig_id in contigs.keys():
                            contig_string = self._get_contig_string(
                                contig_id, assembly_contig_file,
                                parsed_assembly)
                            file.write(contig_string)
                    result_files.append(os.path.join(result_directory, bin_id))
                    log('saved contig file to: {}'.format(result_files[-1]))
            else:
                log('processing bin: {}'.format(bin_id))
                with open(os.path.join(result_directory, bin_id), 'w') as file:
                    contigs = bin.get('contigs')
                    for contig_id in contigs.keys():
                        contig_string = self._get_contig_string(
                            contig_id, assembly_contig_file, parsed_assembly)
                        file.write(contig_string)
                result_files.append(os.path.join(result_directory, bin_id))
                log('saved contig file to: {}'.format(result_files[-1]))

        if params.get('save_to_shock') or params.get('save_to_shock') is None:
            shock_id = self._pack_file_to_shock(result_files)
        else:
            shock_id = None

        returnVal = {
            'shock_id': shock_id,
            'bin_file_directory': result_directory
        }

        return returnVal

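    # Minimal usage sketch (placeholder ref and bin id; `mfu` is an instance of
    # this class). With save_to_shock set to False the bin FASTA files are only
    # written to bin_file_directory and shock_id comes back as None:
    #
    #   ret = mfu.binned_contigs_to_file({
    #       'input_ref': '1234/7/1',
    #       'save_to_shock': False,
    #       'bin_id_list': ['out.001.fasta']
    #   })
    #   print(ret['bin_file_directory'], ret['shock_id'])
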
    def _get_object_name_from_ref(self, obj_ref):
        """given the object reference, return the object_name as a string"""
        return (self.wss.get_object_info_new({"objects": [{
            'ref': obj_ref
        }]})[0][1])

    def extract_binned_contigs_as_assembly(self, params):
        """
        extract_binned_contigs_as_assembly: extract one/multiple Bins from BinnedContigs as
                                            Assembly

        input params:
        binned_contig_obj_ref: BinnedContig object reference
        extracted_assemblies: a string, a comma-separated list of bin_ids to be extracted
        workspace_name: the name of the workspace it gets saved to

        return params:
        assembly_ref_list: a list of generated result Assembly object reference
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        log('--->\nrunning MetagenomeFileUtils.extract_binned_contigs_as_assembly\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_extract_binned_contigs_as_assembly_params(params)

        # convert comma-separated list of bins into a list of individual ids (the python
        # comprehension construction deals with the fact that split(',') returns a list of
        # length one, [''], for an empty string input

        extracted_assemblies = [
            x for x in params.get('extracted_assemblies').split(',') if x
        ]

        binned_contig_obj_ref = params.get('binned_contig_obj_ref')
        contigs_to_file_ret = self.binned_contigs_to_file({
            'input_ref':
            binned_contig_obj_ref,
            'save_to_shock':
            False,
            'bin_id_list':
            extracted_assemblies
        })

        bin_file_directory = contigs_to_file_ret.get('bin_file_directory')
        # bin_files will be either a list of the bin contig files corresponding to the
        # target bin ids, or a list of all bin contig files if extracted_assemblies is empty
        bin_files = os.listdir(bin_file_directory)

        # if extracted_assemblies is empty list, create a full one here
        if not extracted_assemblies:
            extracted_assemblies = bin_files
            log("extracted_assemblies was empty, is now " +
                pformat(extracted_assemblies))

        generated_assembly_ref_list = []
        assembly_suffix = params.get('assembly_suffix').strip()
        for bin_id in extracted_assemblies:
            if bin_id not in map(os.path.basename, bin_files):
                error_msg = 'bin_id [{}] cannot be found in BinnedContig '.format(
                    bin_id)
                error_msg += '[{}]'.format(binned_contig_obj_ref)
                raise ValueError(error_msg)
            else:
                output_assembly_name = bin_id + assembly_suffix
                log('saving assembly: {}'.format(output_assembly_name))
                for bin_file in bin_files:
                    if os.path.basename(bin_file) == bin_id:
                        log('starting generating assembly from {}'.format(
                            bin_id))
                        assembly_params = {
                            'file': {
                                'path': os.path.join(bin_file_directory,
                                                     bin_file)
                            },
                            'workspace_name': params.get('workspace_name'),
                            'assembly_name': output_assembly_name
                        }
                        assembly_ref = self.au.save_assembly_from_fasta(
                            assembly_params)
                        log('finished generating assembly from {}'.format(
                            bin_id))
                        generated_assembly_ref_list.append(assembly_ref)
        setref = None
        if (len(generated_assembly_ref_list) > 1):
            binned_contig_object_name = self._get_object_name_from_ref(
                binned_contig_obj_ref)
            assembly_set_name = params.get('assembly_set_name')
            log("saving assembly set {0}".format(assembly_set_name))
            setref = self.setapi.save_assembly_set_v1({
                'workspace':
                params.get('workspace_name'),
                'output_object_name':
                assembly_set_name,
                'data': {
                    'description':
                    'binned assemblies from {0}'.format(
                        binned_contig_object_name),
                    'items': [{
                        'ref': r
                    } for r in generated_assembly_ref_list]
                }
            })
            log("save assembly set_ref is {0}".format(setref.get('set_ref')))

        report_message = 'Generated Assembly Reference: {}'.format(
            ', '.join(generated_assembly_ref_list))

        reportVal = self._generate_report(report_message, params)

        returnVal = {'assembly_ref_list': generated_assembly_ref_list}
        returnVal.update(reportVal)

        if setref:
            returnVal.update({'assembly_set_ref': setref})

        return returnVal

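    # Minimal usage sketch (placeholder values; `mfu` is an instance of this
    # class). An empty extracted_assemblies string extracts every bin:
    #
    #   ret = mfu.extract_binned_contigs_as_assembly({
    #       'binned_contig_obj_ref': '1234/7/1',
    #       'extracted_assemblies': 'out.001.fasta,out.002.fasta',
    #       'assembly_suffix': '_assembly',
    #       'assembly_set_name': 'extracted_assemblies_set',
    #       'workspace_name': 'my_workspace'
    #   })
    #   print(ret['assembly_ref_list'], ret.get('assembly_set_ref'))
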
    def remove_bins_from_binned_contig(self, params):
        """
        remove_bins_from_binned_contig: remove a list of bins from BinnedContig object

        input params:
        old_binned_contig_ref: Original BinnedContig object reference
        bins_to_remove: a list of bin ids to be removed
        output_binned_contig_name: Name for the output BinnedContigs object
        workspace_name: the name of the workspace new object gets saved to

        return params:
        new_binned_contig_ref: newly created BinnedContig object reference
        """

        log('--->\nrunning MetagenomeFileUtils.remove_bins_from_binned_contig\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_remove_bins_from_binned_contig_params(params)

        binned_contig_object = self.dfu.get_objects(
            {'object_refs': [params.get('old_binned_contig_ref')]})['data'][0]

        assembly_ref = binned_contig_object.get('data').get('assembly_ref')
        total_contig_len = int(
            binned_contig_object.get('data').get('total_contig_len'))

        old_bins = binned_contig_object.get('data').get('bins')
        bins_to_remove = params.get('bins_to_remove')

        for bin in list(old_bins):
            bin_id = bin.get('bid')
            if bin_id in bins_to_remove:
                log('removing bin_id: {}'.format(bin_id))
                old_bins.remove(bin)
                total_contig_len -= int(bin.get('sum_contig_len'))
                log('removed bin_id: {} from BinnedContig object'.format(
                    bin_id))

        binned_contigs = {
            'assembly_ref': assembly_ref,
            'bins': old_bins,
            'total_contig_len': total_contig_len
        }

        new_binned_contig_ref = self._save_binned_contig(
            binned_contigs, params.get('workspace_name'),
            params.get('output_binned_contig_name'))

        returnVal = {'new_binned_contig_ref': new_binned_contig_ref}
        log('successfully saved BinnedContig object')

        return returnVal

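    # Minimal usage sketch (placeholder values; `mfu` is an instance of this
    # class):
    #
    #   ret = mfu.remove_bins_from_binned_contig({
    #       'old_binned_contig_ref': '1234/7/1',
    #       'bins_to_remove': ['out.004.fasta'],
    #       'output_binned_contig_name': 'BinnedContigs_cleaned',
    #       'workspace_name': 'my_workspace'
    #   })
    #   new_ref = ret['new_binned_contig_ref']
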
    def merge_bins_from_binned_contig(self, params):
        """
        merge_bins_from_binned_contig: merge a list of bins from BinnedContig object

        input params:
        old_binned_contig_ref: Original BinnedContig object reference
        bin_merges: a list of bin merges dicts
            new_bin_id: newly created bin id
            bin_to_merge: list of bins to merge
        output_binned_contig_name: Name for the output BinnedContigs object
        workspace_name: the name of the workspace new object gets saved to

        return params:
        new_binned_contig_ref: newly created BinnedContig object reference
        """

        log('--->\nrunning MetagenomeFileUtils.merge_bins_from_binned_contig\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_merge_bins_from_binned_contig_params(params)

        bin_merges = params.get('bin_merges')
        self._check_bin_merges(bin_merges)

        binned_contig_object = self.dfu.get_objects(
            {'object_refs': [params.get('old_binned_contig_ref')]})['data'][0]

        assembly_ref = binned_contig_object.get('data').get('assembly_ref')
        total_contig_len = int(
            binned_contig_object.get('data').get('total_contig_len'))

        bins = binned_contig_object.get('data').get('bins')
        old_bin_ids = [item.get('bid') for item in bins]

        for bin_merge in bin_merges:
            new_bin_id = bin_merge.get('new_bin_id')
            bin_id_to_merge = bin_merge.get('bin_to_merge')
            if set(bin_id_to_merge) <= set(old_bin_ids):
                bin_objects_to_merge = []
                for bin in list(bins):
                    bin_id = bin.get('bid')
                    if bin_id in bin_id_to_merge:
                        bin_objects_to_merge.append(bin)
                        log('removing bin_id: {}'.format(bin_id))
                        bins.remove(bin)
                        total_contig_len -= int(bin.get('sum_contig_len'))
                        log('removed bin_id: {} from BinnedContig object'.
                            format(bin_id))
                new_bin = self._merge_bins(new_bin_id, bin_objects_to_merge)
                log('appending bin_id: {}'.format(new_bin_id))
                bins.append(new_bin)
                total_contig_len += int(new_bin.get('sum_contig_len'))
                log('appended bin_id: {} to BinnedContig object'.format(
                    new_bin_id))
            else:
                bad_bin_ids = list(set(bin_id_to_merge) - set(old_bin_ids))
                error_msg = 'bin_id: [{}] '.format(', '.join(bad_bin_ids))
                error_msg += 'is not listed in BinnedContig object'
                raise ValueError(error_msg)

        binned_contigs = {
            'assembly_ref': assembly_ref,
            'bins': bins,
            'total_contig_len': total_contig_len
        }

        new_binned_contig_ref = self._save_binned_contig(
            binned_contigs, params.get('workspace_name'),
            params.get('output_binned_contig_name'))

        returnVal = {'new_binned_contig_ref': new_binned_contig_ref}
        log('successfully saved BinnedContig object')

        return returnVal

    def edit_bins_from_binned_contig(self, params):
        """
        edit_bins_from_binned_contig: merge/remove a list of bins from BinnedContig object
                                    a wrapper method of:
                                    merge_bins_from_binned_contig
                                    remove_bins_from_binned_contig


        input params:
        old_binned_contig_ref: Original BinnedContig object reference
        bins_to_remove: a list of bin ids to be removed
        bin_merges: a list of bin merges dicts
            new_bin_id: newly created bin id
            bin_to_merge: list of bins to merge
        output_binned_contig_name: Name for the output BinnedContigs object
        workspace_name: the name of the workspace new object gets saved to

        return params:
        new_binned_contig_ref: newly created BinnedContig object reference
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        log('--->\nrunning MetagenomeFileUtils.edit_bins_from_binned_contig\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        input_params = params.copy()
        if params.get('bins_to_remove'):
            bins_to_remove = input_params.get('bins_to_remove')
            if isinstance(bins_to_remove, string_types):
                input_params['bins_to_remove'] = bins_to_remove.split(',')
            new_binned_contig_ref = self.remove_bins_from_binned_contig(
                input_params).get('new_binned_contig_ref')
            input_params['old_binned_contig_ref'] = new_binned_contig_ref

        if params.get('bin_merges'):
            new_binned_contig_ref = self.merge_bins_from_binned_contig(
                input_params).get('new_binned_contig_ref')

        returnVal = {'new_binned_contig_ref': new_binned_contig_ref}

        report_message = self._generate_report_message(new_binned_contig_ref)
        reportVal = self._generate_report(report_message, params)
        returnVal.update(reportVal)

        return returnVal
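
    # Minimal usage sketch for this wrapper (placeholder values; `mfu` is an
    # instance of this class). bins_to_remove may be a comma-separated string
    # or a list, and removals are applied before merges:
    #
    #   ret = mfu.edit_bins_from_binned_contig({
    #       'old_binned_contig_ref': '1234/7/1',
    #       'bins_to_remove': 'out.004.fasta,out.005.fasta',
    #       'bin_merges': [{'new_bin_id': 'merged_bin.001',
    #                       'bin_to_merge': ['out.001.fasta', 'out.002.fasta']}],
    #       'output_binned_contig_name': 'BinnedContigs_edited',
    #       'workspace_name': 'my_workspace'
    #   })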
示例#18
class CufflinksUtils:
    CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/'
    GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/'

    def __init__(self, config):
        """

        :param config:
        :param logger:
        :param directory: Working directory
        :param urls: Service urls
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR
        pass

    def parse_FPKMtracking_calc_TPM(self, filename):
        """
        parse_FPKMtracking_calc_TPM: compute log2-normalized FPKM and TPM
        values per gene from an FPKM tracking file
        :return: (fpkm_dict, tpm_dict)
        """
        fpkm_dict = {}
        tpm_dict = {}
        gene_col = 0
        fpkm_col = 9
        sum_fpkm = 0.0
        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                gene_id = larr[gene_col]
                if gene_id != "":
                    fpkm = float(larr[fpkm_col])
                    sum_fpkm = sum_fpkm + fpkm
                    fpkm_dict[gene_id] = math.log(fpkm + 1, 2)
                    tpm_dict[gene_id] = fpkm

        if sum_fpkm == 0.0:
            log("Warning: Unable to calculate TPM values as sum of FPKM values is 0"
                )
        else:
            for g in tpm_dict:
                tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2)

        return fpkm_dict, tpm_dict

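    # Hedged worked example of the conversion above (made-up numbers): with
    # FPKM values {g1: 10.0, g2: 30.0} the FPKM sum is 40.0, so
    #   fpkm_dict = {g1: log2(10 + 1),           g2: log2(30 + 1)}
    #   tpm_dict  = {g1: log2(10/40 * 1e6 + 1),  g2: log2(30/40 * 1e6 + 1)}
    # i.e. each FPKM is rescaled to a per-million total before the
    # log2(x + 1) transform is applied.
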
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_cufflinks_params(self, params):
        """
        _validate_run_cufflinks_params:
                Raises an exception if params are invalid
        """

        log('Start validating run_cufflinks params')

        # check for required parameters
        for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)

            raise ValueError(error_msg)

    def _run_gffread(self, gff_path, gtf_path):
        """
        _run_gffread: run gffread script

        ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility
        """
        log('converting gff to gtf')
        command = self.GFFREAD_TOOLKIT_PATH + '/gffread '
        command += "-E {0} -T -o {1}".format(gff_path, gtf_path)

        self._run_command(command)

    def _create_gtf_annotation_from_genome(self, genome_ref):
        """
        _create_gtf_annotation_from_genome: create reference annotation file from genome
        """
        ref = self.ws.get_object_subset([{
            'ref':
            genome_ref,
            'included': ['contigset_ref', 'assembly_ref']
        }])
        contig_id = None
        if 'contigset_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['contigset_ref']
        elif 'assembly_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['assembly_ref']
        if contig_id is None:
            raise ValueError(
                "Genome at {0} does not have reference to the assembly object".
                format(genome_ref))
        print(contig_id)
        log("Generating GFF file from Genome")
        try:
            ret = self.au.get_assembly_as_fasta({'ref': contig_id})
            output_file = ret['path']
            mapping_filename = c_mapping.create_sanitized_contig_ids(
                output_file)
            os.remove(output_file)
            # get the GFF
            ret = self.gfu.genome_to_gff({'genome_ref': genome_ref})
            genome_gff_file = ret['file_path']
            c_mapping.replace_gff_contig_ids(genome_gff_file,
                                             mapping_filename,
                                             to_modified=True)
            gtf_ext = ".gtf"

            if not genome_gff_file.endswith(gtf_ext):
                gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf'
                self._run_gffread(genome_gff_file, gtf_path)
            else:
                gtf_path = genome_gff_file

            log("gtf file : " + gtf_path)
        except Exception:
            raise ValueError(
                "Generating GTF file from Genome Annotation object failed: {}"
                .format("".join(traceback.format_exc())))
        return gtf_path

    def _get_gtf_file(self, alignment_ref):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch
        alignment_data = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]['data']

        genome_ref = alignment_data.get('genome_id')
        # genome_name = self.ws.get_object_info([{"ref": genome_ref}], includeMetadata=None)[0][1]
        # ws_gtf = genome_name+"_GTF_Annotation"

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_gtf_file_from_genome_ref(self, genome_ref):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_input_file(self, alignment_ref):
        """
        _get_input_file: get input BAM file from Alignment object
        """

        bam_file_dir = self.rau.download_alignment(
            {'source_ref': alignment_ref})['destination_dir']

        files = os.listdir(bam_file_dir)
        bam_file_list = [
            file for file in files if re.match(r'.*\_sorted\.bam', file)
        ]
        if not bam_file_list:
            bam_file_list = [
                file for file in files if re.match(r'.*(?<!sorted)\.bam', file)
            ]

        if not bam_file_list:
            raise ValueError('Cannot find .bam file from alignment {}'.format(
                alignment_ref))

        bam_file_name = bam_file_list[0]

        bam_file = os.path.join(bam_file_dir, bam_file_name)

        return bam_file

    def _generate_command(self, params):
        """
        _generate_command: generate cufflinks command
        """
        cufflinks_command = '/opt/cufflinks/cufflinks'
        cufflinks_command += (' -q --no-update-check -p ' +
                              str(params.get('num_threads', 1)))
        if 'max_intron_length' in params and params[
                'max_intron_length'] is not None:
            cufflinks_command += (' --max-intron-length ' +
                                  str(params['max_intron_length']))
        if 'min_intron_length' in params and params[
                'min_intron_length'] is not None:
            cufflinks_command += (' --min-intron-length ' +
                                  str(params['min_intron_length']))
        if 'overhang_tolerance' in params and params[
                'overhang_tolerance'] is not None:
            cufflinks_command += (' --overhang-tolerance ' +
                                  str(params['overhang_tolerance']))

        cufflinks_command += " -o {0} -G {1} {2}".format(
            params['result_directory'], params['gtf_file'],
            params['input_file'])

        log('Generated cufflinks command: {}'.format(cufflinks_command))

        return cufflinks_command

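    # For illustration, with num_threads=4 and max_intron_length=300000 the
    # assembled command (placeholder paths) would look roughly like:
    #   /opt/cufflinks/cufflinks -q --no-update-check -p 4 --max-intron-length 300000 -o <result_dir> -G <gtf_file> <input_bam>
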
    def _process_rnaseq_alignment_object(self, params):
        """
        _process_rnaseq_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object
        """
        log('start processing RNASeqAlignment object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        if '/' not in params['genome_ref']:
            params['genome_ref'] = params['workspace_name'] + '/' + params[
                'genome_ref']

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_rnaseq_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params['gtf_file'],
            params['expression_suffix'])

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{
            "ref": expression_obj_ref
        }],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _process_kbasesets_alignment_object(self, params):
        """
        _process_kbasesets_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object
        """
        log('start processing KBaseSets object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_kbasesets_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params.get('gtf_file'),
            params.get('expression_suffix'))

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{
            "ref": expression_obj_ref
        }],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_html_report(self, result_directory, obj_ref):
        """
        _generate_html_report: generate html summary report
        """
        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]

        expression_object_type = expression_object.get('info')[2]

        Overview_Content = ''
        if re.match('KBaseRNASeq.RNASeqExpression-\d.\d',
                    expression_object_type):
            Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
        elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d',
                      expression_object_type):
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data'][
                    'sample_expression_ids']:
                expression_name = self.ws.get_object_info(
                    [{
                        "ref": expression_ref
                    }], includeMetadata=None)[0][1]
                Overview_Content += '<p>{}</p>'.format(expression_name)
        elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type):
            pprint(expression_object)
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data']['items']:
                expression_name = self.ws.get_object_info(
                    [{
                        "ref": expression_ref['ref']
                    }], includeMetadata=None)[0][1]
                condition = expression_ref['label']
                Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format(
                    condition, expression_name)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path':
            result_file_path,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Cufflinks App'
        })
        return html_report

    def _save_rnaseq_expression(self, result_directory, alignment_ref,
                                workspace_name, genome_ref, gtf_file,
                                expression_suffix):
        """
        _save_rnaseq_expression: save Expression object to workspace
        """
        log('start saving Expression object')
        alignment_object_name = self.ws.get_object_info(
            [{
                "ref": alignment_ref
            }], includeMetadata=None)[0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref':
            workspace_name + '/' + expression_name,
            'source_dir':
            result_directory,
            'alignment_ref':
            alignment_ref,
            'tool_used':
            self.tool_used,
            'tool_version':
            self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_kbasesets_expression(self, result_directory, alignment_ref,
                                   workspace_name, genome_ref, gtf_file,
                                   expression_suffix):
        """
        _save_kbasesets_expression: save Expression object to workspace using ExpressionUtils
        and SetAPI
        """
        log('start saving Expression object')

        alignment_info = self.ws.get_object_info3(
            {'objects': [{
                "ref": alignment_ref
            }]})
        alignment_object_name = alignment_info['infos'][0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref':
            workspace_name + '/' + expression_name,
            'source_dir':
            result_directory,
            'alignment_ref':
            alignment_ref,
            'tool_used':
            self.tool_used,
            'tool_version':
            self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_rnaseq_expression_set(self, alignment_expression_map,
                                    alignment_set_ref, workspace_name,
                                    expression_set_name):
        """
        _save_rnaseq_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _save_kbasesets_expression_set(self, alignment_expression_map,
                                       alignment_set_ref, workspace_name,
                                       expression_set_name):
        """
        _save_kbasesets_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _generate_report(self,
                         obj_ref,
                         workspace_name,
                         result_directory,
                         exprMatrix_FPKM_ref=None,
                         exprMatrix_TPM_ref=None):
        """
        _generate_report: generate summary report
        """

        log('creating report')

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_html_report(result_directory,
                                                       obj_ref)

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]
        expression_info = expression_object['info']
        expression_data = expression_object['data']

        expression_object_type = expression_info[2]
        if re.match('KBaseRNASeq.RNASeqExpression-\d+.\d+',
                    expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'Expression generated by Cufflinks'
            }]
        elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'Expression generated by Cufflinks'
            }]
        elif re.match('KBaseSets.ExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'ExpressionSet generated by Cufflinks'
            }]
            items = expression_data['items']
            for item in items:
                objects_created.append({
                    'ref':
                    item['ref'],
                    'description':
                    'Expression generated by Cufflinks'
                })
            objects_created.append({
                'ref':
                exprMatrix_FPKM_ref,
                'description':
                'FPKM ExpressionMatrix generated by Cufflinks'
            })
            objects_created.append({
                'ref':
                exprMatrix_TPM_ref,
                'description':
                'TPM ExpressionMatrix generated by Cufflinks'
            })

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _parse_FPKMtracking(self, filename, metric):
        """
        _parse_FPKMtracking: parse an expression tracking file and return
        log2(value + 1) per gene for the given metric ('FPKM' or 'TPM')
        """
        result = {}
        pos1 = 0
        if metric == 'FPKM':
            pos2 = 7
        elif metric == 'TPM':
            pos2 = 8
        else:
            raise ValueError('Unsupported metric: {}'.format(metric))

        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                if larr[pos1] != "":
                    try:
                        result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2)
                    except ValueError:
                        result[larr[pos1]] = math.log(1, 2)

        return result

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cufflinks_result.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.DS_Store')):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by Cufflinks App'
        })

        return output_files

    def _generate_expression_data(self, result_directory, alignment_ref,
                                  gtf_file, workspace_name, expression_suffix):
        """
        _generate_expression_data: generate Expression object with cufflinks output files
        """
        alignment_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]

        # set expression name
        alignment_object_name = alignment_data_object['info'][1]
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_data = {
            'id': expression_name,
            'type': 'RNA-Seq',
            'numerical_interpretation': 'FPKM',
            'processing_comments': 'log2 Normalized',
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        }
        alignment_data = alignment_data_object['data']

        condition = alignment_data.get('condition')
        expression_data.update({'condition': condition})

        genome_id = alignment_data.get('genome_id')
        expression_data.update({'genome_id': genome_id})

        read_sample_id = alignment_data.get('read_sample_id')
        expression_data.update(
            {'mapped_rnaseq_alignment': {
                read_sample_id: alignment_ref
            }})

        exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM(
            os.path.join(result_directory, 'genes.fpkm_tracking'))

        expression_data.update({'expression_levels': exp_dict})

        expression_data.update({'tpm_expression_levels': tpm_exp_dict})

        handle = self.dfu.file_to_shock({
            'file_path': result_directory,
            'pack': 'zip',
            'make_handle': True
        })['handle']
        expression_data.update({'file': handle})

        return expression_data

    def _generate_expression_set_data(self, alignment_expression_map,
                                      alignment_set_ref, expression_set_name):
        """
        _generate_expression_set_data: generate ExpressionSet object with cufflinks output files
        """
        alignment_set_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_set_ref
            }]})['data'][0]

        alignment_set_data = alignment_set_data_object['data']

        expression_set_data = {
            'tool_used': self.tool_used,
            'tool_version': self.tool_version,
            'id': expression_set_name,
            'alignmentSet_id': alignment_set_ref,
            'genome_id': alignment_set_data.get('genome_id'),
            'sampleset_id': alignment_set_data.get('sampleset_id')
        }

        sample_expression_ids = []
        mapped_expression_objects = []
        mapped_expression_ids = []

        for alignment_expression in alignment_expression_map:
            alignment_ref = alignment_expression.get('alignment_ref')
            expression_ref = alignment_expression.get('expression_obj_ref')
            sample_expression_ids.append(expression_ref)
            mapped_expression_ids.append({alignment_ref: expression_ref})
            alignment_name = self.ws.get_object_info(
                [{
                    "ref": alignment_ref
                }], includeMetadata=None)[0][1]
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_ref
                }], includeMetadata=None)[0][1]
            mapped_expression_objects.append({alignment_name: expression_name})

        expression_set_data['sample_expression_ids'] = sample_expression_ids
        expression_set_data[
            'mapped_expression_objects'] = mapped_expression_objects
        expression_set_data['mapped_expression_ids'] = mapped_expression_ids

        return expression_set_data

    def _process_alignment_set_object(self, params, alignment_object_type):
        """
        _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object
                                        and KBaseSets.ReadsAlignmentSet type object
        """
        log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object'
            '\nparams:\n{}'.format(json.dumps(params, indent=1)))

        alignment_set_ref = params.get('alignment_set_ref')

        if re.match(r'^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            params['gtf_file'] = self._get_gtf_file(alignment_set_ref)
        else:
            if '/' not in params['genome_ref']:
                params['genome_ref'] = params['workspace_name'] + '/' + params[
                    'genome_ref']

            params['gtf_file'] = self._get_gtf_file_from_genome_ref(
                params['genome_ref'])

        alignment_set = self.set_api.get_reads_alignment_set_v1({
            'ref':
            alignment_set_ref,
            'include_item_info':
            0,
            'include_set_item_ref_paths':
            1
        })
        mul_processor_params = []
        for alignment in alignment_set["data"]["items"]:
            alignment_ref = alignment['ref_path']
            alignment_upload_params = params.copy()
            alignment_upload_params['alignment_ref'] = alignment_ref
            mul_processor_params.append(alignment_upload_params)
            # use the following when you want to run the cmd sequentially
            # self._process_kbasesets_alignment_object(mul_processor_params[0])

        cpus = min(params.get('num_threads'), multiprocessing.cpu_count())
        pool = Pool(ncpus=cpus)
        log('running _process_alignment_object with {} cpus'.format(cpus))
        alignment_expression_map = pool.map(
            self._process_kbasesets_alignment_object, mul_processor_params)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        expression_items = list()
        for proc_alignment_return in alignment_expression_map:
            expression_obj_ref = proc_alignment_return.get(
                'expression_obj_ref')
            alignment_ref = proc_alignment_return.get('alignment_ref')
            alignment_info = self.ws.get_object_info3({
                'objects': [{
                    "ref": alignment_ref
                }],
                'includeMetadata':
                1
            })
            condition = alignment_info['infos'][0][10]['condition']
            expression_items.append({
                "ref": expression_obj_ref,
                "label": condition,
            })
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_obj_ref
                }], includeMetadata=None)[0][1]
            self._run_command('cp -R {} {}'.format(
                proc_alignment_return.get('result_directory'),
                os.path.join(result_directory, expression_name)))

        expression_set = {
            "description": "generated by kb_cufflinks",
            "items": expression_items
        }

        expression_set_info = self.set_api.save_expression_set_v1({
            "workspace":
            params['workspace_name'],
            "output_object_name":
            params['expression_set_name'],
            "data":
            expression_set
        })

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_set_info['set_ref']
        }

        widget_params = {
            "output": params.get('expression_set_name'),
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal
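
    # Pool(ncpus=...) above takes an ncpus keyword, which suggests a pathos-style
    # process pool that can map a bound method across workers. A minimal sketch of
    # the same fan-out pattern using only the standard library (and assuming the
    # worker is a picklable top-level function) might look like this:
    @staticmethod
    def _parallel_map_sketch(worker, param_list, max_workers):
        """Run worker over param_list in a process pool; return results in input order."""
        from concurrent.futures import ProcessPoolExecutor
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            return list(executor.map(worker, param_list))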

    def _generate_output_object_name(self, params, alignment_object_type,
                                     alignment_object_name):
        """
        Generates the output object name based on input object type and name and stores it in
        params with key equal to 'expression' or 'expression_set' based on whether the input
        object is an alignment or alignment_set.

        :param params: module input params
        :param alignment_object_type: input alignment object type
        :param alignment_object_name: input alignment object name
        :param alignment_object_data: input alignment object data
        """
        expression_set_suffix = params['expression_set_suffix']
        expression_suffix = params['expression_suffix']

        if re.match(r'^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment$', alignment_object_name):
                params['expression_name'] = re.sub('_[Aa]lignment$',
                                                   expression_suffix,
                                                   alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_name'] = alignment_object_name + expression_suffix
        if re.match(r'^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):
                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix
        if re.match(r'^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):

                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix
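
    # Examples of the naming convention applied above (the object names and the
    # suffix values '_expression' / '_expression_set' are hypothetical):
    #   'sample1_alignment'      -> expression_name     = 'sample1_expression'
    #   'my_reads_alignment_set' -> expression_set_name = 'my_reads_expression_set'
    #   'custom_name' (no match) -> expression_name     = 'custom_name' + expression_suffix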

    def _save_expression_matrix(self, expressionset_ref, workspace_name):
        """
        _save_expression_matrix: save FPKM and TPM ExpressionMatrix
        """

        log('start saving ExpressionMatrix object')

        expression_set_name = self.ws.get_object_info(
            [{
                "ref": expressionset_ref
            }], includeMetadata=None)[0][1]

        output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '',
                                        expression_set_name)

        upload_expression_matrix_params = {
            'expressionset_ref': expressionset_ref,
            'output_obj_name': output_obj_name_prefix,
            'workspace_name': workspace_name
        }

        expression_matrix_refs = self.eu.get_expressionMatrix(
            upload_expression_matrix_params)

        return expression_matrix_refs
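
    # Note: the refs returned above are keyed 'exprMatrix_FPKM_ref' and
    # 'exprMatrix_TPM_ref'; run_cufflinks_app below passes them to _generate_report
    # when the input is an alignment set.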

    def run_cufflinks_app(self, params):
        log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_cufflinks_params(params)

        alignment_object_ref = params.get('alignment_object_ref')
        alignment_object_info = self.ws.get_object_info3(
            {"objects": [{
                "ref": alignment_object_ref
            }]})['infos'][0]

        alignment_object_type = alignment_object_info[2]
        alignment_object_name = alignment_object_info[1]

        # get output object name
        self._generate_output_object_name(params, alignment_object_type,
                                          alignment_object_name)

        log('--->\nalignment object type: \n' +
            '{}'.format(alignment_object_type))

        if re.match(r'^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            params.update({'alignment_ref': alignment_object_ref})
            returnVal = self._process_rnaseq_alignment_object(params)
            report_output = self._generate_report(
                returnVal.get('expression_obj_ref'),
                params.get('workspace_name'),
                returnVal.get('result_directory'))
            returnVal.update(report_output)
        elif re.match(r'^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \
             re.match(r'^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            params.update({'alignment_set_ref': alignment_object_ref})
            returnVal = self._process_alignment_set_object(
                params, alignment_object_type)
            expression_matrix_refs = self._save_expression_matrix(
                returnVal['expression_obj_ref'], params.get('workspace_name'))
            returnVal.update(expression_matrix_refs)

            report_output = self._generate_report(
                returnVal['expression_obj_ref'], params.get('workspace_name'),
                returnVal['result_directory'],
                expression_matrix_refs['exprMatrix_FPKM_ref'],
                expression_matrix_refs['exprMatrix_TPM_ref'])
            returnVal.update(report_output)
        else:
            raise ValueError(
                'Unsupported alignment object type\nObject info:\n{}'.format(
                    alignment_object_info))

        return returnVal
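
# A hedged usage sketch for the class above (the class name is taken from the log
# message in run_cufflinks_app; the workspace name, refs and suffix values are
# hypothetical, and the full set of required keys is checked by
# _validate_run_cufflinks_params, defined elsewhere in this module):
#
#     cufflinks = CufflinksUtil(config)
#     result = cufflinks.run_cufflinks_app({
#         'alignment_object_ref': '123/4/1',
#         'workspace_name': 'my_workspace',
#         'genome_ref': 'my_genome',
#         'expression_suffix': '_expression',
#         'expression_set_suffix': '_expression_set',
#         'num_threads': 4,
#     })
#     # result includes 'expression_obj_ref', 'result_directory' and report fields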
Example #19
class EditAlignmentSet:
    """
    Contains a set of functions for editing reads alignment sets.
    """

    PARAM_IN_WS_NAME_ID = 'workspace_name'
    PARAM_IN_OBJ_NAME_ID = 'output_object_name'
    PARAM_IN_ALIGNSET_REF = 'alignment_set_ref'
    PARAM_IN_ALIGNS_ADD = 'alignments_to_add'
    PARAM_IN_ALIGNS_RM = 'alignments_to_remove'

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'EAS_' + str(uuid.uuid4()))
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.setAPI = SetAPI(self.callback_url)
        pass

    def _process_params(self, params):
        """
        Validates the params passed to the edit_alignment_set method
        """
        for p in [
                self.PARAM_IN_ALIGNSET_REF, self.PARAM_IN_OBJ_NAME_ID,
                self.PARAM_IN_WS_NAME_ID
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        ws_name_id = params.get(self.PARAM_IN_WS_NAME_ID)
        if not isinstance(ws_name_id, int):
            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        alignments_to_add = params.get(self.PARAM_IN_ALIGNS_ADD)
        alignments_to_remove = params.get(self.PARAM_IN_ALIGNS_RM)

        if alignments_to_add is None and alignments_to_remove is None:
            raise ValueError(
                'Either "alignments_to_remove" or "alignments_to_add" should be given'
            )

        return ws_name_id

    def _get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def _get_obj_info(self, ref):
        return self.ws_client.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]

    def _get_set_items(self, alignment_set_ref):

        obj_info = self._get_obj_info(alignment_set_ref)
        obj_type = self._get_type_from_obj_info(obj_info)

        if obj_type in ['KBaseSets.ReadsAlignmentSet']:
            set_data = self.setAPI.get_reads_alignment_set_v1(
                {'ref': alignment_set_ref})
            items = set_data['data']['items']
        elif obj_type in ['KBaseRNASeq.RNASeqAlignmentSet']:
            alignmentset_obj = self.ws_client.get_objects2(
                {'objects': [{
                    'ref': alignment_set_ref
                }]})['data'][0]
            """
            Add each alignment object to align_item and add it to items list
            """
            items = list()
            for alignment_ref in alignmentset_obj['data']['sample_alignments']:
                align_item = dict()
                align_item['ref'] = alignment_ref
                items.append(align_item)
        else:
            raise ValueError(
                '"alignment_set_ref" should be of type KBaseSets.ReadsAlignmentSet or '
                + 'KBaseRNASeq.RNASeqAlignmentSet')

        return items

    def _add_alignments(self, alignment_set_items, alignment_refs_list):

        for alignment_ref in alignment_refs_list:

            found = False
            for set_item in alignment_set_items:
                if set_item.get('ref') == alignment_ref:
                    print('{} already in the input Alignment Set. Not added'.
                          format(alignment_ref))
                    found = True
                    break

            if not found:
                alignment_set_items.append({'ref': alignment_ref})
        return alignment_set_items

    def _remove_alignments(self, input_alignment_set, alignment_set_items,
                           alignments_to_remove):

        for input_item in input_alignment_set:
            if not (input_item.get('ref') in alignments_to_remove):
                alignment_set_items.append(input_item)

        return alignment_set_items
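
    # Illustration of the two helpers above (refs are hypothetical): starting from
    # an input set whose items are [{'ref': '1/1/1'}, {'ref': '1/2/1'}],
    #   _remove_alignments(input_items, [], ['1/2/1']) returns [{'ref': '1/1/1'}],
    #   _add_alignments(items, ['1/3/1']) appends {'ref': '1/3/1'} unless that ref
    #   is already present, in which case it is skipped with a message.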

    def _save_alignment_set(self, ws_name, obj_name, set_data):

        res = self.setAPI.save_reads_alignment_set_v1({
            "workspace": ws_name,
            "output_object_name": obj_name,
            "data": set_data
        })
        return res.get('set_ref')

    def edit_alignment_set(self, params):

        ws_name_id = self._process_params(params)
        obj_name = params.get(self.PARAM_IN_OBJ_NAME_ID)

        alignment_set_ref = params.get(self.PARAM_IN_ALIGNSET_REF)

        print('INPUT ALIGNMENT SET REF: ' + alignment_set_ref)

        input_alignment_set = self._get_set_items(alignment_set_ref)

        alignments_to_remove = params.get(self.PARAM_IN_ALIGNS_RM, None)
        alignments_to_add = params.get(self.PARAM_IN_ALIGNS_ADD, None)

        set_items = list()
        if alignments_to_remove is not None:
            set_items = self._remove_alignments(input_alignment_set, set_items,
                                                alignments_to_remove)
        else:
            # retain the existing items so that additions extend the input set
            set_items = list(input_alignment_set)
        if alignments_to_add is not None:
            set_items = self._add_alignments(set_items, alignments_to_add)

        set_data = {
            'description': 'Edited from {}'.format(alignment_set_ref),
            'items': set_items
        }

        output_alignment_set_ref = self._save_alignment_set(
            ws_name_id, obj_name, set_data)
        return output_alignment_set_ref
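
# A hedged usage sketch for EditAlignmentSet (the workspace and object names and
# refs are hypothetical; at least one of alignments_to_add / alignments_to_remove
# must be supplied):
#
#     editor = EditAlignmentSet(config)
#     new_set_ref = editor.edit_alignment_set({
#         'workspace_name': 'my_workspace',
#         'output_object_name': 'edited_alignment_set',
#         'alignment_set_ref': '123/7/1',
#         'alignments_to_add': ['123/8/1'],
#         'alignments_to_remove': ['123/5/1'],
#     })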