Exemplo n.º 1
0
    def export_original_genbank(self, ctx, params):
        """Fetch a Genome object and return its stored GenBank handle.

        The genome referenced by ``params['genome_ref']`` is requested through
        GenomeAnnotationAPI with only the ``genbank_handle_ref`` field
        included, its workspace type is checked, and its data is then passed
        to ``self.get_genbank_handle``.

        :param ctx: call context; not used by this method.
        :param params: dict holding ``genome_ref`` and, optionally,
            ``ref_path_to_genome``; it is first passed to
            ``self.validate_params``.
        :returns: the value returned by ``self.get_genbank_handle``.
        :raises ValueError: if the object type is not ``KBaseGenomes.Genome``.
        """
        self.validate_params(params)

        # Describe the single genome to load; lookup errors must surface
        # (ignore_errors is 0) rather than yield an empty result.
        genome_spec = {'ref': params['genome_ref']}
        if 'ref_path_to_genome' in params:
            genome_spec['ref_path_to_genome'] = params['ref_path_to_genome']
        request = {
            'genomes': [genome_spec],
            'included_fields': ['genbank_handle_ref'],
            'ignore_errors': 0,
        }

        client = GenomeAnnotationAPI(self.cfg.callbackURL)
        fetched = client.get_genome_v1(request)['genomes'][0]
        object_info, genome = fetched['info'], fetched['data']

        # The type string carries a "-<version>" suffix; compare the name only.
        object_type = object_info[2]
        if object_type.split('-')[0] != 'KBaseGenomes.Genome':
            raise ValueError('Object is not a Genome, it is a:' + str(object_type))

        print('checking if genbank file is cached...')
        return self.get_genbank_handle(genome)
Exemplo n.º 2
0
    def run_vcontact(self, ctx, params):
        """
        Fetch a genome, write the vConTACT input files for it, run vConTACT
        and create a report in ``params['workspace_name']``.

        Note that ``params`` is modified in place: the keys ``gene2genome``
        and ``sequences`` are set to the values returned by ``write_inputs``.

        :param params: instance of type "InParams" -> structure: parameter
           "genome" of type "obj_ref" (Insert your typespec information here.)
        """
        # ctx is the context object
        #BEGIN run_vcontact
        self.callback_url = os.environ['SDK_CALLBACK_URL']

        vcontact_utils = vConTACTUtils(self.config)

        self.genome_api = GenomeAnnotationAPI(self.callback_url)
        genome_request = {"genomes": [{"ref": params['genome']}]}
        genome_data = self.genome_api.get_genome_v1(genome_request)

        gene2genome, sequences = vcontact_utils.genome_to_inputs(genome_data)
        gene2genome_path, sequences_path = vcontact_utils.write_inputs(
            gene2genome, sequences)

        params['gene2genome'] = gene2genome_path
        params['sequences'] = sequences_path

        # The value returned by the run is not used by this method.
        vcontact_utils.run_vcontact(params)

        vcontact_utils.vcontact_help()

        report_utils = KBObjectUtils(self.config)
        report_utils.create_report(params['workspace_name'])

        #END run_vcontact
        pass
Exemplo n.º 3
0
    def export(self, ctx, params):
        """Build a GenBank file for a Genome object and return the result.

        The genome referenced by ``params['genome_ref']`` is requested through
        GenomeAnnotationAPI, its workspace type is checked, and
        ``self.build_genbank_file`` is asked to produce a file named
        ``KBase_derived_<object name>.gbff``.

        :param ctx: call context; not used by this method.
        :param params: dict holding ``genome_ref`` and, optionally,
            ``ref_path_to_genome``; it is first passed to
            ``self.validate_params``.
        :returns: the result of ``self.build_genbank_file`` with
            ``from_cache`` set to 0.
        :raises ValueError: if the object is not a ``KBaseGenomes.Genome`` or
            if ``self.build_genbank_file`` returns ``None``.
        """
        self.validate_params(params)

        # Describe the single genome to load; lookup errors must surface
        # (ignore_errors is 0) rather than yield an empty result.
        genome_spec = {'ref': params['genome_ref']}
        if 'ref_path_to_genome' in params:
            genome_spec['ref_path_to_genome'] = params['ref_path_to_genome']
        request = {
            'genomes': [genome_spec],
            'included_fields': ['genbank_handle_ref'],
            'ignore_errors': 0,
        }

        client = GenomeAnnotationAPI(self.cfg.callbackURL)
        fetched = client.get_genome_v1(request)['genomes'][0]
        object_info = fetched['info']
        # Looked up for parity with the sibling exporters; the value itself
        # is not needed when the file is always rebuilt.
        fetched['data']

        # The type string carries a "-<version>" suffix; compare the name only.
        object_type = object_info[2]
        if object_type.split('-')[0] != 'KBaseGenomes.Genome':
            raise ValueError('Object is not a Genome, it is a:' + str(object_type))

        print('not cached, building file...')
        file_name = "KBase_derived_" + object_info[1] + ".gbff"
        built = self.build_genbank_file(request, file_name)
        if built is None:
            raise ValueError('Unable to generate file.  Something went wrong')
        built['from_cache'] = 0
        return built
Exemplo n.º 4
0
 def test_annotate_contigs(self):
     """Run annotate_contigs through a genome ref path and check feature DNA.

     A small FASTA file is saved as an assembly, a stub genome pointing at
     that assembly is saved, and ``annotate_contigs`` is invoked with the
     ref path "<genome ref>;<assembly ref>". The test then asserts that the
     report has a ``text_message`` and that every feature's
     ``dna_sequence`` equals the sequence returned by AssemblySequenceAPI
     for the feature's location.
     """
     fasta_name = "small.fna"  #"AP009048.fna"
     source_path = os.path.join("/kb/module/test/data", fasta_name)
     scratch_path = os.path.join("/kb/module/work/tmp", fasta_name)
     shutil.copy(source_path, scratch_path)

     assembly_client = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
     assembly_ref = assembly_client.save_assembly_from_fasta({
         'file': {'path': scratch_path},
         'workspace_name': self.getWsName(),
         'assembly_name': 'Assembly.1',
     })

     # Add a genome to the WS to test ref_paths
     genome_name = "Genome.1"
     stub_genome = {
         'id': 'Unknown',
         'features': [],
         'scientific_name': "",
         'domain': "",
         'genetic_code': 0,
         'assembly_ref': assembly_ref,
         'cdss': [],
         'mrnas': [],
         'source': 'Magic!',
         'gc_content': 0,
         'dna_size': 0,
         'reference_annotation': 0,
     }
     provenance = self.getContext().provenance()
     genome_client = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'])
     saved_info = genome_client.save_one_genome_v1({
         'workspace': self.getWsName(),
         'name': genome_name,
         'data': stub_genome,
         'provenance': provenance,
     })['info']
     # Object info positions 6, 0 and 4 are joined into a "a/b/c" reference.
     stub_genome_ref = '/'.join(str(saved_info[pos]) for pos in (6, 0, 4))

     annotate = self.getImpl().annotate_contigs
     context = self.getContext()
     annotation_params = {
         'assembly_ref': "{};{}".format(stub_genome_ref, assembly_ref),
         'output_workspace': self.getWsName(),
         'output_genome_name': genome_name,
         'evalue': None,
         'fast': 0,
         'gcode': 0,
         'genus': 'genus',
         'kingdom': 'Bacteria',
         'metagenome': 0,
         'mincontiglen': 1,
         'norrna': 0,
         'notrna': 0,
         'rawproduct': 0,
         'rfam': 1,
         'scientific_name': 'Super : diper - name;',
     }
     result = annotate(context, annotation_params)[0]

     report = self.getWsClient().get_objects(
         [{'ref': result['report_ref']}])[0]['data']
     self.assertTrue('text_message' in report)
     print("Report:\n" + str(report['text_message']))

     annotated_ref = self.getWsName() + "/" + genome_name
     annotated = self.getWsClient().get_objects(
         [{'ref': annotated_ref}])[0]['data']
     locations_by_id = {}
     for feature in annotated['features']:
         locations_by_id[feature['id']] = feature['location']

     sequence_client = AssemblySequenceAPI(os.environ['SDK_CALLBACK_URL'],
                                           token=self.getContext()['token'])
     dna_sequences = sequence_client.get_dna_sequences({
         'requested_features': locations_by_id,
         'assembly_ref': annotated['assembly_ref'],
     })['dna_sequences']

     # Count features whose stored DNA differs from the assembly-derived one.
     mismatches = sum(
         1 for feature in annotated['features']
         if feature['dna_sequence'] != dna_sequences[feature['id']])
     self.assertEqual(mismatches, 0)
Exemplo n.º 5
0
 def __init__(self, config):
     """Store the scratch path and credentials and create service clients.

     The callback URL and auth token are read from the ``SDK_CALLBACK_URL``
     and ``KB_AUTH_TOKEN`` environment variables.

     :param config: mapping providing ``scratch`` (made absolute here) and
         ``workspace-url``.
     :raises KeyError: if a required config key or environment variable is
         missing.
     """
     # Assigned once; the original repeated this identical assignment.
     self.scratch = os.path.abspath(config['scratch'])
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.token = os.environ['KB_AUTH_TOKEN']
     self.ws = Workspace(config['workspace-url'], token=self.token)
     self.genome_api = GenomeAnnotationAPI(self.callback_url)
     self.au = AssemblyUtil(self.callback_url)
Exemplo n.º 6
0
 def load_genome_features_prepare_fasta(self, genome_refs,
                                        compliant_fasta_dir):
     """Write one protein FASTA per genome and run orthomclAdjustFasta on it.

     For every reference in ``genome_refs`` the genome is loaded through
     GenomeAnnotationAPI, each feature with a non-empty
     ``protein_translation`` becomes a FASTA record whose id is the
     feature's 1-based position, the records are written to
     ``<scratch>/<genome number>.fasta``, and ``orthomclAdjustFasta`` is
     started with ``compliant_fasta_dir`` as working directory.

     :param genome_refs: iterable of workspace references to genomes.
     :param compliant_fasta_dir: directory created here via
         ``os.makedirs`` and used as cwd for the perl script.
     :returns: dict keyed by "<genome number>|<record id>" whose values
         hold ``fid``, ``fpos``, ``gref`` and ``func`` for that feature.
     """
     feature_info = {}
     os.makedirs(compliant_fasta_dir)
     for genome_number, genome_ref in enumerate(genome_refs, 1):
         # ---- Genome loading ----
         self.log_line("Loading Genome object from workspace for ref [" +
                       genome_ref + "]")
         object_info = self.ws.get_object_info_new(
             {"objects": [{"ref": genome_ref}]})[0]
         # Rebuild the reference from object info positions 6, 0 and 4.
         genome_ref = "/".join(str(object_info[pos]) for pos in (6, 0, 4))
         client = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'],
                                      token=self.token)
         request = {
             "genomes": [{"ref": genome_ref}],
             "included_fields": ["scientific_name"],
             "included_feature_fields":
             ["id", "protein_translation", "type", "function"],
         }
         genome = client.get_genome_v1(request)["genomes"][0]["data"]

         # ---- Features + Fasta ----
         self.log_line("Preparing fasta file for ref [" + genome_ref + "]")
         genome_id = str(genome_number)
         records = []
         for feature_pos, feature in enumerate(genome["features"]):
             feature_id = feature["id"]
             sequence = feature.get("protein_translation")
             if not sequence:
                 continue
             record_id = str(feature_pos + 1)
             records.append(
                 SeqRecord(Seq(sequence), id=record_id, description=""))
             feature_info[genome_id + "|" + record_id] = {
                 "fid": feature_id,
                 "fpos": feature_pos,
                 "gref": genome_ref,
                 "func": feature.get("function"),
             }
         fasta_file = self.scratch + "/" + genome_id + ".fasta"
         SeqIO.write(records, fasta_file, "fasta")

         # ---- Adjusting Fasta by Orthomcl ----
         self.log_line("Running orthomclAdjustFasta for ref [" +
                       genome_ref + "]")
         command = [
             "perl", self.plbin + "/orthomclAdjustFasta", genome_id,
             fasta_file, "1"
         ]
         process = subprocess.Popen(command,
                                    cwd=compliant_fasta_dir,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
         self.log_process(process)
     return feature_info
Exemplo n.º 7
0
    def export(self, ctx, params):
        """Return a GFF (or GTF) file for a Genome, using the cached one if any.

        The genome referenced by ``params['genome_ref']`` is requested through
        GenomeAnnotationAPI and its type is checked. Unless ``is_gtf`` is 1,
        ``self.get_gff_handle`` is tried first; when it yields nothing (or
        GTF was requested) the file is produced by ``self.build_gff_file``.

        :param ctx: call context; not used by this method.
        :param params: dict holding ``genome_ref`` and optionally
            ``ref_path_to_genome``, ``is_gtf`` (default 0) and
            ``target_dir`` (default: a timestamped "gff_..." folder under
            ``self.cfg.sharedFolder``); it is first passed to
            ``self.validate_params``.
        :returns: the handle/build result with ``from_cache`` set to 1 or 0.
        :raises ValueError: if the object is not a ``KBaseGenomes.Genome`` or
            if ``self.build_gff_file`` returns ``None``.
        """
        self.validate_params(params)

        # Describe the single genome to load; lookup errors must surface
        # (ignore_errors is 0) rather than yield an empty result.
        genome_spec = {'ref': params['genome_ref']}
        if 'ref_path_to_genome' in params:
            genome_spec['ref_path_to_genome'] = params['ref_path_to_genome']
        request = {
            'genomes': [genome_spec],
            'included_fields': ['gff_handle_ref'],
            'ignore_errors': 0,
        }

        client = GenomeAnnotationAPI(self.cfg.callbackURL)
        fetched = client.get_genome_v1(request)['genomes'][0]
        object_info, genome = fetched['info'], fetched['data']

        # The type string carries a "-<version>" suffix; compare the name only.
        object_type = object_info[2]
        if object_type.split('-')[0] != 'KBaseGenomes.Genome':
            raise ValueError('Object is not a Genome, it is a:' + str(object_type))

        is_gtf = params.get('is_gtf', 0)

        # Fall back to a fresh timestamped folder when no target is given.
        target_dir = params.get('target_dir') or os.path.join(
            self.cfg.sharedFolder, "gff_" + str(int(time.time() * 1000)))
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        # Only GFF output can be served from the stored handle.
        if is_gtf != 1:
            print('checking if GFF file is cached...')
            cached = self.get_gff_handle(genome, target_dir)
            if cached is not None:
                cached['from_cache'] = 1
                return cached
            print('not cached, building file...')

        built = self.build_gff_file(request, target_dir, object_info[1],
                                    is_gtf == 1)
        if built is None:
            raise ValueError('Unable to generate file.  Something went wrong')
        built['from_cache'] = 0
        return built
Exemplo n.º 8
0
 def annotate_genes(self, ctx, params):
     """
     Predict functions for a genome's proteins with ``kmer_search`` and save
     the updated genome.

     Protein translations are written to ``<scratch>/proteins.faa`` and fed
     to ``kmer_search`` on stdin; its stdout goes to
     ``<scratch>/output.txt``. Each output line is read as
     "<feature id><TAB><function>"; lines without both columns are skipped.
     Matching features get their ``function`` replaced and the genome is
     saved under ``output_genome_name`` in ``output_workspace``.

     :param params: instance of type "AnnotateGenesParams" -> structure:
        parameter "input_genome_ref" of String, parameter
        "output_workspace" of String, parameter "output_genome_name" of
        String
     """
     # ctx is the context object
     #BEGIN annotate_genes
     ga = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'], token=ctx['token'])
     genome = ga.get_genome_v1({"genomes": [{"ref": params['input_genome_ref']}]}
                               )["genomes"][0]["data"]
     features = genome["features"]

     # Only features that carry a protein translation are submitted.
     records = []
     for feature in features:
         feature_id = feature["id"]
         sequence = feature.get("protein_translation")
         if sequence:
             records.append(SeqRecord(Seq(sequence), id=feature_id, description=""))
     fasta_file = self.scratch + "/proteins.faa"
     SeqIO.write(records, fasta_file, "fasta")

     output_file = self.scratch + '/output.txt'
     with open(fasta_file, "r") as infile:
         with open(output_file, "w") as outfile:
             p = subprocess.Popen("kmer_search -m 5 -g 200 -d /data/kmer/V2Data -a",
                                  shell=True, cwd=self.scratch, stdin=infile,
                                  stdout=outfile, stderr=sys.stderr.fileno())
             exit_code = p.wait()
     if exit_code != 0:
         # Surface the failure instead of discarding the exit status; the
         # run continues so any output already produced is still applied.
         print("Warning: kmer_search exited with status " + str(exit_code))

     fid_to_func = {}
     with open(output_file, "r") as infile:
         for line in infile:
             parts = line.rstrip().split("\t")
             if len(parts) < 2:
                 # Blank line or a line without a function column; indexing
                 # parts[1] here used to raise IndexError.
                 continue
             fid, func = parts[0], parts[1]
             print("Function prediction for feature id=" + fid + ": " + func)
             fid_to_func[fid] = func

     for feature in features:
         feature_id = feature["id"]
         if feature_id in fid_to_func:
             feature['function'] = fid_to_func[feature_id]

     prov = ctx.provenance()
     info = ga.save_one_genome_v1({'workspace': params['output_workspace'],
                                   'name': params['output_genome_name'],
                                   'data': genome, 'provenance': prov})['info']
     # Object info positions 6, 0 and 4 are joined into a "a/b/c" reference.
     genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
     print("Genome saved to " + genome_ref)
     #END annotate_genes
     pass
    def setUpClass(cls):
        """Create the shared context, clients and workspace for the tests.

        The auth token comes from ``KB_AUTH_TOKEN``, the configuration from
        the ``GenomeFileUtil`` section of the file named by
        ``KB_DEPLOYMENT_CONFIG``, and one workspace with a
        timestamp-suffixed name is created for all tests.
        """
        token = environ.get('KB_AUTH_TOKEN', None)

        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        provenance = [{
            'service': 'GenomeFileUtil',
            'method': 'please_never_use_it_in_production',
            'method_params': [],
        }]
        cls.ctx.update({
            'token': token,
            'provenance': provenance,
            'authenticated': 1,
        })

        # Copy the module's section of the deployment config into a dict.
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        parser = ConfigParser()
        parser.read(config_file)
        for key, value in parser.items('GenomeFileUtil'):
            cls.cfg[key] = value

        cls.wsURL = cls.cfg['workspace-url']
        cls.ws = workspaceService(cls.wsURL, token=token)
        cls.gaa = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'])
        cls.serviceImpl = GenomeFileUtil(cls.cfg)

        # create one WS for all tests
        workspace_name = "test_GenomeAnnotationAPI_" + str(int(time.time() * 1000))
        cls.ws.create_workspace({'workspace': workspace_name})
        cls.wsName = workspace_name
Exemplo n.º 10
0
    def __init__(self, config):
        """Store settings from ``config`` and create the service clients.

        :param config: mapping providing ``scratch``, ``ctx``,
            ``SDK_CALLBACK_URL`` and ``workspace-url``.
        """
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        self.ws_client = workspaceService(config["workspace-url"])

        # Every SDK client below is pointed at the same callback URL.
        callback = self.callback_url
        self.gfu = GenomeFileUtil(callback)
        self.au = AssemblyUtil(callback)
        self.kbr = KBaseReport(callback)
        self.dfu = DataFileUtil(callback)
        self.genome_api = GenomeAnnotationAPI(callback)

        # State that starts out empty and is filled in later.
        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None
 def load_genome_features_prepare_fasta(self, genome_refs, compliant_fasta_dir):
     """Write one protein FASTA per genome and run orthomclAdjustFasta on it.

     For every reference in ``genome_refs`` the combined genome data is
     loaded through GenomeAnnotationAPI. Each CDS that has a protein entry
     with a ``protein_amino_acid_sequence`` becomes a FASTA record whose id
     is the CDS's 1-based position; records go to
     ``<scratch>/<genome number>.fasta`` and ``orthomclAdjustFasta`` is
     started with ``compliant_fasta_dir`` as working directory.

     :param genome_refs: iterable of workspace references to genomes.
     :param compliant_fasta_dir: directory created here via
         ``os.makedirs`` and used as cwd for the perl script.
     :returns: dict keyed by "<genome number>|<record id>" whose values
         hold ``fid``, ``fpos``, ``gref`` and ``func`` for that CDS.
     """
     feature_info = {}
     os.makedirs(compliant_fasta_dir)
     for genome_number, genome_ref in enumerate(genome_refs, 1):
         # ---- Genome loading ----
         self.log_line("Loading Genome object from workspace for ref [" +
                       genome_ref + "]")
         object_info = self.ws.get_object_info_new(
             {"objects": [{"ref": genome_ref}]})[0]
         # Rebuild the reference from object info positions 6, 0 and 4.
         genome_ref = "/".join(str(object_info[pos]) for pos in (6, 0, 4))
         client = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'],
                                      token=self.token)
         combined = client.get_combined_data({
             "ref": genome_ref,
             "exclude_genes": 1,
             "exclude_summary": 1,
         })
         cds_map = combined["feature_by_id_by_type"][combined["cds_type"]]
         protein_map = combined["protein_by_cds_id"]
         cds_ids = list(cds_map.keys())

         # ---- Features + Fasta ----
         self.log_line("Preparing fasta file for ref [" + genome_ref + "]")
         genome_id = str(genome_number)
         records = []
         for feature_pos, feature_id in enumerate(cds_ids):
             cds = cds_map[feature_id]
             if feature_id not in protein_map:
                 continue
             protein = protein_map[feature_id]
             if "protein_amino_acid_sequence" not in protein:
                 continue
             record_id = str(feature_pos + 1)
             records.append(SeqRecord(Seq(protein["protein_amino_acid_sequence"]),
                                      id=record_id, description=""))
             # Prefer the protein's function; fall back to the CDS's.
             func = None
             if "protein_function" in protein:
                 func = protein["protein_function"]
             if (not func or len(func) == 0) and "feature_function" in cds:
                 func = cds["feature_function"]
             feature_info[genome_id + "|" + record_id] = {
                 "fid": feature_id,
                 "fpos": feature_pos,
                 "gref": genome_ref,
                 "func": func,
             }
         fasta_file = self.scratch + "/" + genome_id + ".fasta"
         SeqIO.write(records, fasta_file, "fasta")

         # ---- Adjusting Fasta by Orthomcl ----
         self.log_line("Running orthomclAdjustFasta for ref [" + genome_ref + "]")
         command = ["perl", self.plbin + "/orthomclAdjustFasta",
                    genome_id, fasta_file, "1"]
         process = subprocess.Popen(command, cwd=compliant_fasta_dir,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
         self.log_process(process)
     return feature_info
 def setUpClass(cls):
     """Create the shared context, clients, workspace and test genomes.

     The auth token comes from ``KB_AUTH_TOKEN`` and the configuration from
     the ``GenomeSearchUtil`` section of the file named by
     ``KB_DEPLOYMENT_CONFIG``. A timestamp-suffixed workspace is created
     and three genomes are loaded through ``cls.load_genome_direct``.
     """
     token = environ.get('KB_AUTH_TOKEN', None)

     # Copy the module's section of the deployment config into a dict.
     config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     parser = ConfigParser()
     parser.read(config_file)
     for key, value in parser.items('GenomeSearchUtil'):
         cls.cfg[key] = value

     # Resolve the user name for the token via the auth service.
     auth_url = cls.cfg.get(
         'auth-service-url',
         "https://kbase.us/services/authorization/Sessions/Login")
     user_id = _KBaseAuth(auth_url).get_user(token)

     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     provenance = [{
         'service': 'GenomeSearchUtil',
         'method': 'please_never_use_it_in_production',
         'method_params': [],
     }]
     cls.ctx.update({
         'token': token,
         'user_id': user_id,
         'provenance': provenance,
         'authenticated': 1,
     })

     cls.cfg['genome-index-dir'] = cls.cfg['scratch']
     cls.cfg['debug'] = "1"
     cls.scratch = cls.cfg['scratch']
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL, token=token)
     cls.serviceImpl = GenomeSearchUtil(cls.cfg)

     cls.wsName = "test_SaveGenomeTest_" + str(int(time.time() * 1000))
     cls.wsClient.create_workspace({'workspace': cls.wsName})
     cls.ga_client = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'])

     # Genomes shared by the tests.
     cls.banno_ref = cls.load_genome_direct(
         'data/b.anno.2.genome.json', 'b.anno.2',
         contigset_filename='data/b.anno.2.contigs.json')
     cls.rhodo_ref = cls.load_genome_direct(
         'data/rhodobacter.json', 'rhodobacter',
         contigset_filename='data/rhodobacter_contigs.json')
     cls.eco_ref = cls.load_genome_direct(
         'data/new_ecoli_genome.json', 'ecoli',
         'data/e_coli_assembly.fasta',
         gtype="KBaseGenomes.Genome")
    def setUpClass(cls):
        """Create the shared context, clients, workspace and test data.

        The auth token comes from ``KB_AUTH_TOKEN`` and the configuration
        from the ``kb_functional_enrichment_1`` section of the file named by
        ``KB_DEPLOYMENT_CONFIG``. A timestamp-suffixed workspace is created
        and ``cls.prepare_data`` is called last.
        """
        token = environ.get('KB_AUTH_TOKEN', None)

        # Copy the module's section of the deployment config into a dict.
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        parser = ConfigParser()
        parser.read(config_file)
        for key, value in parser.items('kb_functional_enrichment_1'):
            cls.cfg[key] = value

        # Getting username from Auth profile for token
        auth_url = cls.cfg['auth-service-url']
        user_id = _KBaseAuth(auth_url).get_user(token)

        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        provenance = [{
            'service': 'kb_functional_enrichment_1',
            'method': 'please_never_use_it_in_production',
            'method_params': [],
        }]
        cls.ctx.update({
            'token': token,
            'user_id': user_id,
            'provenance': provenance,
            'authenticated': 1,
        })

        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.serviceImpl = kb_functional_enrichment_1(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        cls.fe1_runner = FunctionalEnrichmentUtil(cls.cfg)
        cls.dfu = DataFileUtil(cls.callback_url)
        cls.gaa = GenomeAnnotationAPI(cls.callback_url)
        cls.ws = Workspace(cls.wsURL, token=token)

        cls.wsName = "test_kb_functional_enrichment_1_" + str(
            int(time.time() * 1000))
        cls.wsClient.create_workspace({'workspace': cls.wsName})

        cls.prepare_data()
    def setUpClass(cls):
        """Create the shared context, clients, workspace and test data.

        The auth token comes from ``KB_AUTH_TOKEN`` and the configuration
        from the ``PanGenomeAPI`` section of the file named by
        ``KB_DEPLOYMENT_CONFIG``. The scratch directory is removed and
        recreated, a timestamp-suffixed workspace is created and
        ``cls.prepare_data`` is called last.
        """
        token = environ.get('KB_AUTH_TOKEN', None)

        # Copy the module's section of the deployment config into a dict.
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        parser = ConfigParser()
        parser.read(config_file)
        for key, value in parser.items('PanGenomeAPI'):
            cls.cfg[key] = value

        # Getting username from Auth profile for token
        auth_url = cls.cfg['auth-service-url']
        user_id = _KBaseAuth(auth_url).get_user(token)

        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        provenance = [{
            'service': 'PanGenomeAPI',
            'method': 'please_never_use_it_in_production',
            'method_params': [],
        }]
        cls.ctx.update({
            'token': token,
            'user_id': user_id,
            'provenance': provenance,
            'authenticated': 1,
        })

        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.serviceImpl = PanGenomeAPI(cls.cfg)

        # Start every run from an empty scratch directory.
        cls.scratch = cls.cfg['scratch']
        shutil.rmtree(cls.scratch)
        os.mkdir(cls.scratch)
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        workspace_name = "test_pangenome_api_" + str(int(time.time() * 1000))
        cls.ws_info = cls.wsClient.create_workspace(
            {'workspace': workspace_name})
        cls.gcs = GenomeComparisonSDK(cls.callback_url)
        cls.gaa = GenomeAnnotationAPI(cls.callback_url)
        cls.prepare_data()
Exemplo n.º 15
0
    def generate_cummerbund_plot2(self, ctx, cummerbundstatParams):
        """
        Generate the cummeRbund plot set and the differential-expression
        statistics object for a cuffdiff result and save both to the
        workspace.

        The cuffdiff object is read from the workspace, the functions of the
        referenced genome's features are collected, each R plot script in
        the plot list is run via ``script_util2.rplotandupload``, and two
        objects are saved: a ``KBaseRNASeq.cummerbund_output`` and a
        ``KBaseRNASeq.DifferentialExpressionStat``.

        :param cummerbundstatParams: instance of type "cummerbundstatParams"
           -> structure: parameter "workspace" of String, parameter
           "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws
           KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
           "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
           KBaseRNASeq.cummerbund_output), parameter "ws_diffstat_output" of
           type "ws_diffstat_output" (Differential stat workspace id)
        :returns: instance of type "ws_cummerbund_output" (@id ws
           KBaseRNASeq.cummerbund_output)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN generate_cummerbund_plot2
        params = cummerbundstatParams
        returnVal = params['ws_cummerbund_output']

        # Set up workspace client
        user_token = ctx['token']
        ws_client = Workspace(url=self.__WS_URL, token=user_token)

        # Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name': params['ws_cuffdiff_id'],
            'workspace': params['workspace']
        }])

        # Check if workspace has data. This must happen before s_res[0] is
        # touched; previously the guard came after the indexing and so an
        # empty result raised IndexError instead of reaching it.
        # NOTE(review): the early returns hand back the bare value while the
        # normal path returns [returnVal] - confirm which shape callers of
        # the early path expect.
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal

        print("Getting genome info")

        genome_ref = s_res[0]['data']['genome_id']
        print(genome_ref)
        gaapi = GenomeAnnotationAPI(self.callbackURL, token=user_token)
        genome = gaapi.get_genome_v1({"genomes": [{"ref": genome_ref}],
                                      "included_fields": ["scientific_name"],
                                      "included_feature_fields": ["id", "function", "type"
                                                                  ]})["genomes"][0]["data"]

        # Map feature id -> function, using 'Unknown' when the function is
        # missing or empty.
        genome_dict = {}
        for feature in genome['features']:
            genome_dict[feature['id']] = feature.get('function') or 'Unknown'

        cuffdiff_dir = script_util2.extract_cuffdiff_data(self.__LOGGER, self.__SHOCK_URL,
                                                          self.__SCRATCH, s_res, user_token)

        # Test for failure before building the log message: concatenating
        # False to a string raised TypeError and hid the failure.
        if (cuffdiff_dir is False):
            return returnVal
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        # Run R script to run cummerbund json and update the cummerbund output json file
        # Prepare output object.
        outputobject = dict()

        # Prepare output plot list
        cummerbundplotset = []
        # List of plots to generate
        plotlist = [
            {'file': "dispersionplot.R",
             'title': "Dispersion plot",
             'description': "Dispersion plot is the quality measure of the data. It estimates deviation from threshold against counts in FPKM."},

            {'file': "fpkmscvplot.R",
             'title': "Genes CV plot",
             'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data."},

            {'file': "isoformscvplot.R",
             'title': "Isoform CV plot",
             'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data.Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate fpkm estimates."},

            {'file': "densityplot.R",
             'title': "Density plot",
             'description': "The density plot shows the distribution of FPKM scores across samples"},

            {'file': "csdensityrepplot.R",
             'title': "Replicates density plot",
             'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates"},

            {'file': "boxplot.R",
             'title': "Box plots",
             'description': "The box plots show the FPKM distribution across samples."},

            {'file': "boxrepplot.R",
             'title': "Box plots of replicates",
             'description': "The box plots of replicates show the FPKM distribution across sample replicates."},

            {'file': "pairwisescatterplots.R",
             'title': "Pairwise scatter plots",
             'description': "The scatterplots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line."},

            {'file': "volcanomatrixplot.R",
             'title': "Volcano matrix plots",
             'description': "Volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on log2 fold change cut off."},

            {'file': "pcaplot.R",
             'title': "PCA plot",
             'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions."},

            {'file': "pcarepplot.R",
             'title': "PCA plot including replicates",
             'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions including replicates."},

            {'file': "mdsplot.R",
             'title': "Multi-dimensional scaling plot",
             'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset. "},

            {'file': "mdsrepplot.R",
             'title': "Multi-dimensional scaling plot including replicates",
             'description': "Multi-dimensional scaling plot including replicates are  similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions."}
        ]

        # Iterate through the plotlist and generate the images and json files.
        # A failing plot is logged and the remaining plots are still attempted.
        for plot in plotlist:
            status = script_util2.rplotandupload(self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS,
                                                 plot['file'], self.__SHOCK_URL, self.__HS_URL,
                                                 user_token,
                                                 cummerbundplotset, plot['title'],
                                                 plot['description'], cuffdiff_dir)
            if status == False:
                self.__LOGGER.info("Problem generating image and json file - " + plot["file"])

        # Populate the output object
        outputobject['cummerbundplotSet'] = cummerbundplotset

        # TODO: Need to figure out how to get rnaseq experiment id
        outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
        outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']

        ws_client.save_objects({
            "workspace": params['workspace'],
            "objects": [{
                "type": "KBaseRNASeq.cummerbund_output",
                "data": outputobject,
                "name": params["ws_cummerbund_output"]}]
        })

        # Parse gene_exp.diff into a JSON file, then save that JSON as the
        # differential-expression statistics object.
        infile = join(cuffdiff_dir, "gene_exp.diff")
        outfile = join(cuffdiff_dir, "gene_exp_diff.out")
        v.volcano_plot_data_parse_and_upload(infile, outfile, genome_dict)
        with open(outfile) as f:
            statdata = json.load(f)
        ws_client.save_objects({
            "workspace": params['workspace'],
            "objects": [{
                "type": "KBaseRNASeq.DifferentialExpressionStat",
                "data": statdata,
                "name": params["ws_diffstat_output"]}]
        })

        #END generate_cummerbund_plot2

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method generate_cummerbund_plot2 return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]
Exemplo n.º 16
0
    def get_enhancedFEM(self, params, tok):
        """
        implements get_enhancedFilteredExpressionMatrix() method

        Fetches the filtered expression matrix (FEM) referenced by
        params['fem_object_ref'] and returns an "enhanced" copy in which every
        row starts with a feature description, a fold change and a q-value
        ('NA' when unknown) followed by the values that get_matrix_stats()
        returns for that row.

        :param params: dictionary that must contain 'fem_object_ref'
        :param tok: token handed to the Workspace and GenomeAnnotationAPI clients
        :return: dictionary with the keys 'genome_ref', 'scale', 'type' and 'data'
        :raises ValueError: if 'fem_object_ref' is not in params
        :raises Exception: if the FEM has a different number of row_ids and value rows
        """

        self.ws_client = Workspace(self.ws_url, token=tok)

        if 'fem_object_ref' not in params:
            raise ValueError(
                "fem_object_ref parameter not given to get_enhancedFilteredExpressionMatrix"
            )

        fem_query = {'objects': [{'ref': params.get('fem_object_ref')}]}
        fem_record = self.ws_client.get_objects2(fem_query)['data'][0]
        fem = fem_record.get('data')
        # Not used below; the lookup is kept so that a record without
        # provenance still fails at this point.
        _first_provenance = fem_record.get('provenance')[0]

        # The enhanced FEM starts from the top-level fields of the FEM.
        efem = {field: fem.get(field) for field in ('genome_ref', 'scale', 'type')}

        matrix = fem.get('data')
        row_ids = matrix.get('row_ids')
        enhanced_rows = []
        efem['data'] = {
            'col_ids': [
                "description", "fold-change", "q-value", "min", "max", "mean",
                "std_dev", "is_missing_values"
            ],
            'column_labels': [
                "Description", "Fold change", "Q value", "Min. expression",
                "Max. expression", "Mean expression", "Std. dev.",
                "Missing values?"
            ],
            'row_ids': row_ids,
            'values': enhanced_rows,
        }

        n_efem_rows = len(row_ids)
        matrix_rows = matrix.get('values')
        if len(matrix_rows) != n_efem_rows:
            raise Exception(
                "length discrepancy in filtered expression matrix: {0} row_ids but {1} values"
                .format(n_efem_rows, len(matrix_rows)))

        # Feature descriptions, looked up by feature id (client created here
        # rather than in the constructor).
        gaa = GenomeAnnotationAPI(self.serviceWizardURL, token=tok)
        feat_dict = gaa.get_feature_functions({
            'ref': fem.get('genome_ref'),
            'feature_id_list': None
        })

        # When the FEM references a differential expression matrix (DEM), load
        # it and index it by row id so fold change and q-value can be looked up.
        dem_ref = fem.get('diff_expr_matrix_ref')
        if dem_ref:
            dem_query = {'objects': [{'ref': dem_ref}]}
            dem = self.ws_client.get_objects2(dem_query)['data'][0].get('data')
            dem_dict = self.convert_dem_to_dict(dem.get('data'))
        else:
            dem_dict = {}

        for row_id, matrix_row in zip(row_ids, matrix_rows):
            row_stats = self.get_matrix_stats(matrix_row)
            # 'NA' stands in for anything we have no entry for.
            description = feat_dict.get(row_id) or 'NA'
            fold_change, q_value = dem_dict.get(row_id) or ('NA', 'NA')
            enhanced_rows.append([description, fold_change, q_value] + row_stats)

        return efem
Exemplo n.º 17
0
class ProkkaUtils:

    def __init__(self, config):
        """Keep the run configuration and create the service clients.

        :param config: dictionary providing "scratch", "ctx",
            "SDK_CALLBACK_URL" and "workspace-url"
        """
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        callback_url = config["SDK_CALLBACK_URL"]
        self.callback_url = callback_url

        # One workspace client plus the clients reached through the callback URL.
        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(callback_url)
        self.au = AssemblyUtil(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.genome_api = GenomeAnnotationAPI(callback_url)

        # State that is filled in later by the annotation methods.
        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None

    @staticmethod
    def _get_input_value(params, key):
        """Get value of key after checking for its existence

        :param params: Params dictionary haystack
        :param key: Key to search in Params
        :return: Parameter Value
        :raises ValueError: raises an exception if the key doesn"t exist
        """
        if not key in params:
            raise ValueError("Parameter " + key + " should be set in input parameters")
        return params[key]

    @staticmethod
    def _get_qualifier_value(qualifier):
        """Get first qualifier from the list of qualifiers

        :param qualifier: list contents of the qualifier from BCBio GFF Tools
        :return: first element in the list
        """
        return qualifier[0] if (qualifier and len(qualifier) > 0) else None

    def download_seed_data(self):
        """Fetch the SEED subsystem ontology (SSO) and index its terms by EC number

        Fills self.ec_to_sso with, for every EC number found in a term name of
        the form "... (EC x.y.z.w)", the list of matching SSO terms; dumps the
        ontology to /kb/module/work/seed_so.json; and stores the ontology
        reference in self.sso_ref.

        :return: None
        """
        sso_ret = self.ws_client.get_objects([{"ref": "KBaseOntology/seed_subsystem_ontology"}])[0]
        sso = sso_ret["data"]

        ec_marker = "(EC "
        for term in sso["term_hash"].values():
            term_name = term["name"]
            if ec_marker not in term_name or not term_name.endswith(")"):
                continue
            # The EC number sits between the marker and the closing parenthesis.
            ec_start = term_name.index(ec_marker) + len(ec_marker)
            ec = term_name[ec_start:-1].strip()
            self.ec_to_sso.setdefault(ec, []).append(term)
        print("EC found in SSO: " + str(len(self.ec_to_sso)))

        sso_info = sso_ret["info"]
        sso_ref = "/".join(str(sso_info[position]) for position in (6, 0, 4))
        with open("/kb/module/work/seed_so.json", "w") as outfile:
            json.dump(sso, outfile, sort_keys=True, indent=4)
        self.sso_ref = sso_ref

    def inspect_assembly(self, assembly_meta, assembly_ref):
        """Check to see if assembly has too many contigs and might not be a metagenome or
        non prokaryotic dataset

        :param assembly_meta: information about the assembly reference
        :param assembly_ref: the assembly reference number
        :return: a tuple containing gc_content and dna_size
        """
        gc_content = float(assembly_meta.get("GC content"))
        dna_size = int(assembly_meta.get("Size"))
        n_contigs = 0
        if "N Contigs" in assembly_meta:
            n_contigs = int(assembly_meta.get("N Contigs"))
        else:
            contig = self.ws_client.get_objects([{"ref": assembly_ref}])[0]
            n_contigs = len(contig["data"]["contigs"])
        if n_contigs >= 30000:
            message = """
             Hmmm.  There are over 30,000 contigs in this Assembly. 
             It looks like you are trying to run Prokka on a metagenome or non-prokaryotic data set. 
             If this is a metagenome data set we recommend using an App like MaxBin to first bin the contigs into genome-like bins. 
             These bins can then be individually annotated as a single genome using Prokka. 
             If this data comes from a Eukaryotic sample, KBase does not currently have an annotation app designed for Eukaryotes. 
             Alternatively, you can try reducing the number of contigs using a filter app.")
             raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions
             """
            print(message)
            raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions")

        assembly_info = namedtuple("assembly_info", "gc_content dna_size")
        return assembly_info(gc_content, dna_size)

    @staticmethod
    def create_renamed_assembly(assembly_fasta_filepath):
        """Write a copy of the fasta file whose records are named contig_1, contig_2, ...

        The original record id is kept, in parentheses, as the description of the
        renamed record.

        :param assembly_fasta_filepath: path of the fasta file to rename
        :return: namedtuple with the path of the renamed fasta file, the number of contigs,
        the mapping from new ids to old ids, and the renamed contigs as SeqRecords
        """
        renamed_records = []
        new_ids_to_old = {}
        contig_counter = 0  # stays 0 when the fasta file holds no records
        originals = SeqIO.parse(assembly_fasta_filepath, "fasta")
        for contig_counter, original in enumerate(originals, 1):
            new_id = "contig_" + str(contig_counter)
            renamed_records.append(
                SeqRecord(original.seq, id=new_id, description="(" + original.id + ")"))
            new_ids_to_old[new_id] = original.id

        renamed_assembly_fasta_filepath = assembly_fasta_filepath + "_renamed.fna"
        SeqIO.write(renamed_records, renamed_assembly_fasta_filepath, "fasta")

        renamed_assembly = namedtuple("renamed_assembly",
                                      "filepath contig_counter new_ids_to_old records")
        return renamed_assembly(renamed_assembly_fasta_filepath, contig_counter, new_ids_to_old,
                                renamed_records)

    def run_prokka(self, params, subject_fasta_filepath):
        """Run Prokka

        :param params: Prokka parameters
        :param subject_fasta_filepath: The contigs or genes to run prokka against
        :return: The  directory with all of the prokka output files
        """
        output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4())

        # --kingdom [X]  Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default "Bacteria")
        kingdom = "Bacteria"
        if "kingdom" in params and params["kingdom"]:
            kingdom = params["kingdom"]

        prokka_cmd_list = ["perl", "/kb/prokka/bin/prokka", "--outdir", output_dir, "--prefix",
                           "mygenome", "--kingdom", kingdom]

        # --genus [X]       Genus name (triggers to use --usegenus)
        if "genus" in params and params["genus"]:
            prokka_cmd_list.extend(["--genus", str(params["genus"]), "--usegenus"])
        # --gcode [N]       Genetic code / Translation table (set if --kingdom is set) (default "0")
        if "gcode" in params and params["gcode"]:
            prokka_cmd_list.extend(["--gcode", str(params["gcode"])])
        else:
            prokka_cmd_list.extend(["--gcode", "0"])
        # --gram [X]        Gram: -/neg +/pos (default "")
        if "gram" in params and params["gram"]:
            raise ValueError("gram parameter is not supported in current Prokka installation")
        # --metagenome      Improve gene predictions for highly fragmented genomes (default OFF)
        if "metagenome" in params and params["metagenome"] == 1:
            prokka_cmd_list.append("--metagenome")
        # --rawproduct      Do not clean up /product annotation (default OFF)
        if "rawproduct" in params and params["rawproduct"] == 1:
            prokka_cmd_list.append("--rawproduct")
        # --fast            Fast mode - skip CDS /product searching (default OFF)
        if "fast" in params and params["fast"] == 1:
            prokka_cmd_list.append("--fast")
        # --mincontiglen [N] Minimum contig size [NCBI needs 200] (default "1")
        if "mincontiglen" in params and params["mincontiglen"]:
            prokka_cmd_list.extend(["--mincontiglen", str(params["mincontiglen"])])
        # --evalue [n.n]    Similarity e-value cut-off (default "1e-06")
        if "evalue" in params and params["evalue"]:
            prokka_cmd_list.extend(["--evalue", str(params["evalue"])])
        # --rfam            Enable searching for ncRNAs with Infernal+Rfam (SLOW!) (default "0")
        if "rfam" in params and params["rfam"] == 1:
            prokka_cmd_list.append("--rfam")
        # --norrna          Don"t run rRNA search (default OFF)
        if "norrna" in params and params["norrna"] == 1:
            prokka_cmd_list.append("--norrna")
        # --notrna          Don"t run tRNA search (default OFF)
        if "notrna" in params and params["notrna"] == 1:
            prokka_cmd_list.append("--notrna")
        prokka_cmd_list.append(subject_fasta_filepath)
        print("Prokka command line: " + str(prokka_cmd_list))

        try:
            check_output(prokka_cmd_list, cwd=self.scratch)
        except CalledProcessError as e:
            pprint(e)
        return output_dir

    @staticmethod
    def retrieve_prokka_results(output_dir):
        """ Load the sequences Prokka wrote and locate its GFF file

        :param output_dir: directory holding mygenome.faa, mygenome.ffn and mygenome.gff
        :return: namedtuple with the sequences from the .faa and .ffn files, keyed by
        record id, and the gff_filepath
        :raises ValueError: if the GFF file does not exist
        """
        def sequences_by_id(fasta_path):
            # Map every record id of a fasta file to its sequence as a string.
            return {entry.id: str(entry.seq) for entry in SeqIO.parse(fasta_path, "fasta")}

        cds_to_prot = sequences_by_id(output_dir + "/mygenome.faa")
        cds_to_dna = sequences_by_id(output_dir + "/mygenome.ffn")

        gff_file = output_dir + "/mygenome.gff"
        if not os.path.isfile(gff_file):
            raise ValueError("PROKKA output GFF file is not found")

        prokka_results = namedtuple("prokka_results", "cds_to_prot cds_to_dna gff_filepath")
        return prokka_results(cds_to_prot, cds_to_dna, gff_file)

    def parse_prokka_results(self, **prokka_parse_parameters):
        """ Go through the prokka results from the input contigs and then
        create the features, mrnas and cdss components of the KbaseGenome.Genome object

        Features without an ID qualifier, or whose ID has no sequence in cds_to_dna,
        are skipped. A CDS and an mRNA entry are added for every feature whose ID has
        a translation in cds_to_prot.

        :param prokka_parse_parameters: keyword arguments gff_filepath, cds_to_dna,
            cds_to_prot and new_ids_to_old
        :return: namedtuple with Genome:features Genome:cdss Genome:mrnas and the
            report_message of genes discovered
        """
        gff_filepath = prokka_parse_parameters["gff_filepath"]
        cds_to_dna = prokka_parse_parameters["cds_to_dna"]
        cds_to_prot = prokka_parse_parameters["cds_to_prot"]
        new_ids_to_old = prokka_parse_parameters["new_ids_to_old"]

        evidence = self.make_annotation_evidence()

        cdss = []
        mrnas = []
        features = []
        non_hypothetical = 0
        genes_with_ec = 0
        genes_with_sso = 0
        prot_lengths = []
        with open(gff_filepath, "r") as f1:
            for rec in GFF.parse(f1):
                # Prokka ran on renamed contigs; report the original contig id.
                contig_id = new_ids_to_old[str(rec.id)]
                for ft in rec.features:
                    loc = ft.location
                    # Parsed locations are 0-based; stored locations are 1-based.
                    min_pos = int(loc.start) + 1
                    max_pos = int(loc.end)
                    strand = "+" if loc.strand == 1 else "-"
                    flen = max_pos - min_pos + 1
                    # The start is the first base in the direction of the strand.
                    start = min_pos if strand == "+" else max_pos
                    location = [[contig_id, start, strand, flen]]
                    qualifiers = ft.qualifiers
                    generated_id = self._get_qualifier_value(qualifiers.get("ID"))
                    if not generated_id:
                        # Skipping feature with no ID (mostly repeat regions)
                        continue
                    dna = cds_to_dna.get(generated_id)
                    if not dna:
                        # Skipping feature with no DNA (mostly repeat regions)
                        continue
                    name = self._get_qualifier_value(qualifiers.get("Name"))
                    ec = self._get_qualifier_value(qualifiers.get("eC_number"))
                    gene = self._get_qualifier_value(qualifiers.get("gene"))
                    product = self._get_qualifier_value(qualifiers.get("product"))
                    fid = generated_id
                    aliases = []
                    if name:
                        aliases.append(name)
                    if gene:
                        aliases.append(gene)
                    if ec:
                        aliases.append(ec)
                        genes_with_ec += 1
                    # hashlib only accepts bytes, so text sequences are encoded first.
                    dna_bytes = dna if isinstance(dna, bytes) else dna.encode("utf-8")
                    md5 = hashlib.md5(dna_bytes).hexdigest()
                    feature = {"id": fid, "location": location, "type": "gene",
                               "aliases": aliases, "md5": md5, "dna_sequence": dna,
                               "dna_sequence_length": len(dna),
                               }
                    if product:
                        feature["function"] = product
                        if product != "hypothetical protein":
                            non_hypothetical += 1
                    if ec and ec in self.ec_to_sso:
                        sso_list = self.ec_to_sso[ec]
                        sso_terms = {}
                        for sso_item in sso_list:
                            sso_terms[sso_item["id"]] = {"id": sso_item["id"],
                                                         "evidence": [evidence],
                                                         "term_name": sso_item["name"],
                                                         "ontology_ref": self.sso_ref,
                                                         "term_lineage": []}
                        feature["ontology_terms"] = {"SSO": sso_terms}
                        genes_with_sso += 1
                    cds = None
                    mrna = None
                    prot = cds_to_prot.get(generated_id)
                    if prot:
                        cds_id = fid + "_CDS"
                        mrna_id = fid + "_mRNA"
                        prot_len = len(prot)
                        prot_lengths.append(prot_len)
                        feature["protein_translation"] = prot
                        feature["protein_translation_length"] = prot_len
                        feature["cdss"] = [cds_id]
                        feature["mrnas"] = [mrna_id]
                        cds = {"id": cds_id, "location": location, "md5": md5, "parent_gene": fid,
                               "parent_mrna": mrna_id, "function": (product if product else ""),
                               "ontology_terms": {}, "protein_translation": prot,
                               "protein_translation_length": prot_len, "aliases": aliases}
                        mrna = {"id": mrna_id, "location": location, "md5": md5,
                                "parent_gene": fid, "cds": cds_id}
                    features.append(feature)
                    if cds:
                        cdss.append(cds)
                    if mrna:
                        mrnas.append(mrna)

        # Without any protein-coding gene there is nothing to average; report 0
        # rather than dividing by zero.
        if prot_lengths:
            average_prot_length = int(sum(prot_lengths) / float(len(prot_lengths)))
        else:
            average_prot_length = 0

        # Prepare report
        report = ""
        report += "Number of genes predicted: " + str(len(features)) + "\n"
        report += "Number of protein coding genes: " + str(len(prot_lengths)) + "\n"
        report += "Number of genes with non-hypothetical function: " + str(non_hypothetical) + "\n"
        report += "Number of genes with EC-number: " + str(genes_with_ec) + "\n"
        report += "Number of genes with Seed Subsystem Ontology: " + str(genes_with_sso) + "\n"
        report += "Average protein length: " + str(average_prot_length) + " aa.\n"

        annotated_assembly = namedtuple("annotated_assembly", "features cdss mrnas report_message")
        return annotated_assembly(features, cdss, mrnas, report)

    def get_new_annotations(self, gff_filepath):
        """Collect the function and SSO terms Prokka assigned to every record of a GFF file

        :param gff_filepath: path of the GFF file written by Prokka
        :return: dictionary mapping each record id to a dictionary with "id" and, when
            Prokka reported them, "function" (the product) and "ontology_terms" (the SSO
            terms of the reported EC numbers, keyed by term id)
        """
        evidence = self.make_annotation_evidence()
        genome = {}
        with open(gff_filepath, "r") as f:
            for rec in GFF.parse(f):
                gid = rec.id
                # Store the record id itself (the builtin function ``id`` was stored before).
                gene_features = {"id": gid}

                for feature in rec.features:
                    qualifiers = feature.qualifiers
                    if "product" in qualifiers:
                        gene_features["function"] = " ".join(qualifiers["product"])

                    if "eC_number" in qualifiers:
                        ec_numbers = qualifiers["eC_number"]
                        sso_terms = {}
                        for ec in ec_numbers:
                            # EC numbers unknown to the SSO contribute no terms.
                            sso_list = self.ec_to_sso.get(ec, [])
                            for sso_item in sso_list:
                                sso_terms[sso_item["id"]] = {"id": sso_item["id"],
                                                             "evidence": [evidence],
                                                             "term_name": sso_item["name"],
                                                             "ontology_ref": self.sso_ref,
                                                             "term_lineage": []}

                        gene_features["ontology_terms"] = sso_terms
                genome[gid] = gene_features

        return genome

    def write_genome_to_fasta(self, genome_data):
        """

        :param genome_data:
        :return:
        """
        fasta_for_prokka_filepath = os.path.join(self.scratch,
                                                 "features_" + str(uuid.uuid4()) + ".fasta")
        count = 0
        with open(fasta_for_prokka_filepath, "w") as f:
            for item in genome_data["data"]["features"]:
                if "id" not in item or "dna_sequence" not in item:
                    print("This feature does not have a valid dna sequence.")
                else:
                    f.write(">" + item["id"] + "\n" + item["dna_sequence"] + "\n")
                    count += 1

        print("Finished printing to" + fasta_for_prokka_filepath)
        if os.stat(fasta_for_prokka_filepath).st_size == 0:
            raise Exception(
                "This genome does not contain features with DNA_SEQUENCES. Fasta file is empty.")

        return fasta_for_prokka_filepath

    def make_sso_ontology_event(self):
        """Build the ontology event describing this Prokka run against the SSO

        The method version is taken from the "module-version" entry of
        /kb/module/kbase.yml and the timestamp from the current local time.

        :return: Ontology_event to be appended to the list of genome ontology events
        :raises ValueError: if /kb/module/kbase.yml has no module-version entry
        """
        time_string = str(
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        # Context manager so the file handle is closed as soon as it has been read.
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        # Raw string: "\W" is a regex class, not a string escape.
        match = re.search(r"module-version:\n\W+(.+)\n", yml_text)
        if match is None:
            raise ValueError("module-version entry not found in /kb/module/kbase.yml")
        version = match.group(1)

        return {
            "method": "Prokka Annotation",
            "method_version": version,
            "timestamp": time_string,
            "id": "SSO",
            "ontology_ref": self.sso_ref
        }

    def make_annotation_evidence(self):
        """Build the evidence record attached to the ontology terms Prokka assigns

        The method version is taken from the "module-version" entry of
        /kb/module/kbase.yml and the timestamp from the current local time.

        :return: dictionary with the keys method, method_version and timestamp
        :raises ValueError: if /kb/module/kbase.yml has no module-version entry
        """
        time_string = str(
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        # Context manager so the file handle is closed as soon as it has been read.
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        # Raw string: "\W" is a regex class, not a string escape.
        match = re.search(r"module-version:\n\W+(.+)\n", yml_text)
        if match is None:
            raise ValueError("module-version entry not found in /kb/module/kbase.yml")
        version = match.group(1)

        return {
            "method": "Prokka Annotation (Evidence)",
            "method_version": version,
            "timestamp": time_string,
        }

    def create_genome_ontology_fields(self, genome_data):
        # Make sure ontologies_events exist
        sso_event = self.make_sso_ontology_event()
        ontology_event_index = 0

        if 'ontology_events' in genome_data['data']:
            genome_data['data']['ontology_events'].append(sso_event)
            ontology_event_index += len(genome_data['data']['ontology_events']) - 1
        else:
            genome_data['data']['ontology_events'] = [sso_event]

        genome_obj_modified = namedtuple('genome_obj_modified', 'genome_data ontology_event_index')
        return genome_obj_modified(genome_data, ontology_event_index)

    @staticmethod
    def old_genome_ontologies(feature, new_ontology):
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}
        for key in new_ontology.keys():
            feature["ontology_terms"]["SSO"][key] = new_ontology[key]
        return feature

    @staticmethod
    def new_genome_ontologies(feature, new_ontology, ontology_event_index):
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}

        for key in new_ontology.keys():
            id = new_ontology[key]["id"]
            if id in feature["ontology_terms"]["SSO"]:
                feature["ontology_terms"]["SSO"][id].append(ontology_event_index)
            else:
                feature["ontology_terms"]["SSO"][id] = [ontology_event_index]
        return feature

    def annotate_genome_with_new_annotations(self, **annotation_args):
        """

        :param annotation_args: genome_data, new_annotations from prokka, and the output_genome_name
        :type
        :return:
        """
        genome_data = annotation_args["genome_data"]
        new_annotations = annotation_args["new_annotations"]

        new_genome = False
        if 'feature_counts' in genome_data['data']:
            new_genome = True
            genome_obj_modified = self.create_genome_ontology_fields(genome_data)
            genome_data = genome_obj_modified.genome_data
            ontology_event_index = genome_obj_modified.ontology_event_index

        stats = {"current_functions": len(genome_data["data"]["features"]), "new_functions": 0,
                 "found_functions": 0, "new_ontologies": 0}

        function_report_filepath = os.path.join(self.scratch, "ontology_report")
        ontology_report_filepath = os.path.join(self.scratch, "function_report")
        onto_r = open(function_report_filepath, "w")
        func_r = open(ontology_report_filepath, "w")
        func_r.write("function_id current_function new_function\n")
        onto_r.write("function_id current_ontology new_ontology\n")

        for i, feature in enumerate(genome_data["data"]["features"]):
            fid = feature["id"]
            current_function = feature.get("function", "")
            current_functions = feature.get("functions", [])
            current_ontology = feature.get("ontology_terms", None)
            new_function = ""
            new_ontology = dict()

            if fid in new_annotations:
                # Set Function
                new_function = new_annotations[fid].get("function", "")
                if new_function and "hypothetical protein" not in new_function:
                    if (new_function != current_function and new_function not in current_functions):
                        stats['new_functions'] += 1
                    genome_data["data"]["features"][i]["function"] = new_function
                    genome_data["data"]["features"][i]["functions"] = [new_function]
                    stats['found_functions'] += 1

                # Set Ontologies
                new_ontology = new_annotations[fid].get("ontology_terms", None)
                if new_ontology:
                    stats['new_ontologies'] += 1
                    if new_genome:
                        genome_data["data"]["features"][i] = self. \
                            new_genome_ontologies(feature, new_ontology, ontology_event_index)
                    else:
                        genome_data["data"]["features"][i] = self. \
                            old_genome_ontologies(feature, new_ontology)
            if current_function:
                func_r.write(json.dumps([fid, [current_function], [new_function]]) + "\n")
            else:
                func_r.write(json.dumps([fid, current_functions, [new_function]]) + "\n")

            onto_r.write(json.dumps([fid, current_ontology, new_ontology]) + "\n")

        func_r.close()
        onto_r.close()

        info = self.gfu.save_one_genome({"workspace": self.output_workspace,
                                         "name": annotation_args["output_genome_name"],
                                         "data": genome_data["data"],
                                         "provenance": self.ctx.provenance()})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        annotated_genome = namedtuple("annotated_genome",
                                      "genome_ref function_report_filepath ontology_report_filepath stats")

        return annotated_genome(genome_ref, function_report_filepath, ontology_report_filepath,
                                stats)

    def upload_file(self, filepath, message="Annotation report generated by kb_prokka"):
        """
        Upload a file to shock
        :param filepath: File to upload
        :param message: Optional Upload Message
        :return:
        """
        output_file_shock_id = self.dfu.file_to_shock({"file_path": filepath})["shock_id"]
        print("Uploaded filepath" + filepath + "to shock and got id" + output_file_shock_id)
        return {"shock_id": output_file_shock_id,
                "name": os.path.basename(filepath),
                "label": os.path.basename(filepath),
                "description": message}

    def report_annotated_genome(self, genome):
        """ Create report output with newly reannotated genome, and some stats

        :param genome: Reannotated Genome Reference, Report Files and Stats
        :return: Reference to Report Object
        """
        genome_ref = genome.genome_ref
        stats = genome.stats

        file_links = [self.upload_file(genome.ontology_report_filepath),
                      self.upload_file(genome.function_report_filepath)]

        report_message = ("Genome Ref:{0}\n"
                          "Number of features sent into prokka:{1}\n"
                          "New functions found:{2}\n"
                          "Ontology terms found:{3}\n"
                          ).format(genome_ref, stats["current_functions"], stats["new_functions"],
                                   stats["new_ontologies"])

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "file_links": file_links,
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": self.output_workspace
             })

        return {"output_genome_ref": genome_ref, "report_name": report_info["name"],
                "report_ref": report_info["ref"]}

    def annotate_genome(self, params):
        """ Re-annotate an existing genome with Prokka.

        The genome's feature sequences are written to a fasta file, Prokka is run on
        it, and the functions and ontology terms it reports are merged into the genome,
        which is saved under the requested output name.

        :param params: Reference to the genome, Output File Name, UI Parameters
        :return: Report with Reannotated Genome and Stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        genome_ref = self._get_input_value(params, "object_ref")
        output_name = self._get_input_value(params, "output_genome_name")

        genome_query = {"genomes": [{"ref": genome_ref}], 'downgrade': 0}
        genome_data = self.genome_api.get_genome_v1(genome_query)["genomes"][0]

        features_fasta = self.write_genome_to_fasta(genome_data)
        prokka_output_dir = self.run_prokka(params, features_fasta)
        gff_filepath = self.retrieve_prokka_results(prokka_output_dir).gff_filepath
        reannotated = self.annotate_genome_with_new_annotations(
            genome_data=genome_data,
            new_annotations=self.get_new_annotations(gff_filepath),
            output_genome_name=output_name)
        return self.report_annotated_genome(reannotated)

    def annotate_assembly(self, params, assembly_info):
        """
        Annotate an assembly with Prokka. The steps are: download the assembly
        as a fasta file, rename the contigs, run prokka against the contigs,
        parse the results, and finally create and upload a genome object.

        :param params: mapping with ``object_ref``, ``output_genome_name`` and
            ``output_workspace``; optional ``scientific_name``, ``kingdom``
            and ``gcode`` override the defaults ``'Unknown'``, ``"Bacteria"``
            and ``0`` when present and truthy
        :param assembly_info: indexable whose element 10 is handed to
            ``inspect_assembly`` (presumably the workspace object_info tuple
            with its metadata at index 10 -- TODO confirm against caller)
        :return: dict with ``output_genome_ref``, ``report_name`` and
            ``report_ref``
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        assembly_ref = self._get_input_value(params, "object_ref")
        output_genome_name = self._get_input_value(params, "output_genome_name")
        output_workspace = self._get_input_value(params, "output_workspace")
        # From here on ``assembly_info`` is the inspected summary (it exposes
        # gc_content and dna_size below), not the raw argument.
        assembly_info = self.inspect_assembly(assembly_info[10], assembly_ref)
        orig_fasta_file = self.au.get_assembly_as_fasta({"ref": assembly_ref})["path"]

        # Rename Assembly and Keep Track of Old Contigs
        renamed_assembly = self.create_renamed_assembly(orig_fasta_file)
        # Run Prokka with the modified, renamed fasta file
        output_dir = self.run_prokka(params, renamed_assembly.filepath)
        # Prokka_results
        prokka_results = self.retrieve_prokka_results(output_dir)
        # Parse Results.
        # FIX: cds_to_prot used to be passed prokka_results.cds_to_dna, so the
        # parser received DNA sequences where protein translations belong.
        annotated_assembly = self.parse_prokka_results(
            gff_filepath=prokka_results.gff_filepath,
            cds_to_dna=prokka_results.cds_to_dna,
            cds_to_prot=prokka_results.cds_to_prot,
            new_ids_to_old=renamed_assembly.new_ids_to_old)

        # Force defaults for optional parameters that may be missing or None
        scientific_name = params.get('scientific_name') or 'Unknown'
        domain = params.get('kingdom') or "Bacteria"
        gcode = params.get('gcode') or 0

        genome = {"id": "Unknown",
                  "features": annotated_assembly.features,
                  "scientific_name": scientific_name,
                  "domain": domain,
                  "genetic_code": gcode,
                  "assembly_ref": assembly_ref,
                  "cdss": annotated_assembly.cdss,
                  "mrnas": annotated_assembly.mrnas,
                  "source": "PROKKA annotation pipeline",
                  "gc_content": assembly_info.gc_content,
                  "dna_size": assembly_info.dna_size,
                  "reference_annotation": 0}

        info = self.gfu.save_one_genome({"workspace": output_workspace,
                                         "name": output_genome_name,
                                         "data": genome,
                                         "provenance": self.ctx.provenance()})["info"]

        # Reference is assembled from info[6]/info[0]/info[4] (presumably
        # workspace id / object id / version -- TODO confirm).
        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        report_message = "Genome saved to: " + output_workspace + "/" + \
                         output_genome_name + "\n" + annotated_assembly.report_message

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": output_workspace
             })

        return {"output_genome_ref": genome_ref, "report_name": report_info["name"],
                "report_ref": report_info["ref"]}
Exemplo n.º 18
0
    def generate_cummerbund_plot2(self, ctx, cummerbundstatParams):
        """
        :param cummerbundstatParams: instance of type "cummerbundstatParams"
           -> structure: parameter "workspace" of String, parameter
           "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws
           KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
           "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
           KBaseRNASeq.cummerbund_output), parameter "ws_diffstat_output" of
           type "ws_diffstat_output" (Differential stat workspace id)
        :returns: instance of type "ws_cummerbund_output" (@id ws
           KBaseRNASeq.cummerbund_output)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN generate_cummerbund_plot2
        # Overview: fetch the cuffdiff object, build a feature-id -> function
        # lookup from its genome, render every cummeRbund plot, then save the
        # plot set and the differential-expression statistics objects.
        params = cummerbundstatParams
        returnVal = params['ws_cummerbund_output']

        # Set up workspace client
        user_token = ctx['token']
        ws_client = Workspace(url=self.__WS_URL, token=user_token)

        # Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name': params['ws_cuffdiff_id'],
            'workspace': params['workspace']
        }])

        # Check if workspace has data. This guard must run BEFORE s_res[0] is
        # touched; it used to sit after the indexing and could never trigger
        # (an empty result raised IndexError instead).
        # NOTE(review): the early exits return the bare value while the normal
        # exit returns [returnVal]; kept as-is to preserve existing behavior.
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Getting genome info")

        genome_ref = s_res[0]['data']['genome_id']
        print(genome_ref)
        gaapi = GenomeAnnotationAPI(self.callbackURL, token=user_token)
        genome = gaapi.get_genome_v1({"genomes": [{"ref": genome_ref}],
                                      "included_fields": ["scientific_name"],
                                      "included_feature_fields": ["id", "function", "type"
                                                                  ]})["genomes"][0]["data"]

        # Map feature id -> function; a missing or empty function becomes
        # 'Unknown' (replaces a bare ``except`` and a shadowed builtin ``id``).
        genome_dict = {}
        for feature in genome['features']:
            genome_dict[feature['id']] = feature.get('function') or 'Unknown'

        cuffdiff_dir = script_util2.extract_cuffdiff_data(
            self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)

        # Test for failure before concatenating: "..." + False raises TypeError.
        if cuffdiff_dir is False:
            return returnVal
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        # Run R script to run cummerbund json and update the cummerbund output json file
        # Prepare output object.
        outputobject = dict()

        # Prepare output plot list
        cummerbundplotset = []
        # List of plots to generate
        plotlist = [
            {'file': "dispersionplot.R",
             'title': "Dispersion plot",
             'description': "Dispersion plot is the quality measure of the data. It estimates deviation from threshold against counts in FPKM."},

            {'file': "fpkmscvplot.R",
             'title': "Genes CV plot",
             'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data."},

            {'file': "isoformscvplot.R",
             'title': "Isoform CV plot",
             'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data.Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate fpkm estimates."},

            {'file': "densityplot.R",
             'title': "Density plot",
             'description': "The density plot shows the distribution of FPKM scores across samples"},

            {'file': "csdensityrepplot.R",
             'title': "Replicates density plot",
             'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates"},

            {'file': "boxplot.R",
             'title': "Box plots",
             'description': "The box plots show the FPKM distribution across samples."},

            {'file': "boxrepplot.R",
             'title': "Box plots of replicates",
             'description': "The box plots of replicates show the FPKM distribution across sample replicates."},

            {'file': "pairwisescatterplots.R",
             'title': "Pairwise scatter plots",
             'description': "The scatterplots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line."},

            {'file': "volcanomatrixplot.R",
             'title': "Volcano matrix plots",
             'description': "Volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on log2 fold change cut off."},

            {'file': "pcaplot.R",
             'title': "PCA plot",
             'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions."},

            {'file': "pcarepplot.R",
             'title': "PCA plot including replicates",
             'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions including replicates."},

            {'file': "mdsplot.R",
             'title': "Multi-dimensional scaling plot",
             'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset. "},

            {'file': "mdsrepplot.R",
             'title': "Multi-dimensional scaling plot including replicates",
             'description': "Multi-dimensional scaling plot including replicates are  similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions."}
        ]

        # Iterate through the plotlist and generate the images and json files.
        # A failed plot is only logged; the remaining plots are still attempted.
        for plot in plotlist:
            status = script_util2.rplotandupload(self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS,
                                                 plot['file'], self.__SHOCK_URL, self.__HS_URL, user_token,
                                                 cummerbundplotset, plot['title'], plot['description'],
                                                 cuffdiff_dir)
            # Equality (not truthiness) is kept on purpose: the helper's other
            # possible return values are not visible from here.
            if status == False:  # noqa: E712
                self.__LOGGER.info("Problem generating image and json file - " + plot["file"])

        # Populate the output object
        outputobject['cummerbundplotSet'] = cummerbundplotset

        #TODO: Need to figure out how to get rnaseq experiment id
        outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
        outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']

        ws_client.save_objects({
            "workspace": params['workspace'],
            "objects": [{
                "type": "KBaseRNASeq.cummerbund_output",
                "data": outputobject,
                "name": params["ws_cummerbund_output"]}]
        })

        # Parse gene_exp.diff into a JSON file, then store its content as the
        # differential-expression statistics object.
        infile = join(cuffdiff_dir, "gene_exp.diff")
        outfile = join(cuffdiff_dir, "gene_exp_diff.out")
        v.volcano_plot_data_parse_and_upload(infile, outfile, genome_dict)
        with open(outfile) as f:
            statdata = json.load(f)
        ws_client.save_objects({
            "workspace": params['workspace'],
            "objects": [{
                "type": "KBaseRNASeq.DifferentialExpressionStat",
                "data": statdata,
                "name": params["ws_diffstat_output"]}]
        })

        #END generate_cummerbund_plot2

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method generate_cummerbund_plot2 return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]
Exemplo n.º 19
0
class vConTACT:
    '''
    Module Name:
    vConTACT

    Module Description:
    A KBase module: vConTACT
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/bolduc/vcontact"
    GIT_COMMIT_HASH = "ff92f754f02d757aa925d2327fc8ef2bf0af4b07"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        # Kept verbatim; it is later handed to the utility classes.
        self.config = config
        #END_CONSTRUCTOR
        pass

    def run_vcontact(self, ctx, params):
        """
        :param params: instance of type "InParams" -> structure: parameter
           "genome" of type "obj_ref" (Insert your typespec information here.)
        """
        # ctx is the context object
        #BEGIN run_vcontact
        callback = os.environ['SDK_CALLBACK_URL']
        self.callback_url = callback

        utils = vConTACTUtils(self.config)

        self.genome_api = GenomeAnnotationAPI(callback)
        genome_ref = params['genome']
        genome_query = {"genomes": [{"ref": genome_ref}]}
        fetched = self.genome_api.get_genome_v1(genome_query)

        # Turn the genome into the two input files the tool consumes and
        # record their paths in ``params`` (the caller's dict is mutated).
        mapping, proteins = utils.genome_to_inputs(fetched)
        mapping_path, proteins_path = utils.write_inputs(mapping, proteins)
        params['gene2genome'] = mapping_path
        params['sequences'] = proteins_path

        # The result is bound but not returned by this method.
        returnVal = utils.run_vcontact(params)

        utils.vcontact_help()

        KBObjectUtils(self.config).create_report(params['workspace_name'])

        #END run_vcontact
        pass

    def status(self, ctx):
        #BEGIN_STATUS
        # Static health/version record for the service.
        returnVal = dict(
            state="OK",
            message="",
            version=self.VERSION,
            git_url=self.GIT_URL,
            git_commit_hash=self.GIT_COMMIT_HASH,
        )
        #END_STATUS
        return [returnVal]
Exemplo n.º 20
0
    def build_gff_file(self, getGenomeOptions, output_dir, output_filename,
                       is_gtf):
        """Fetch a genome and write its features as a GFF or GTF file.

        The file is written to ``output_dir/output_filename`` plus ``.gtf``
        or ``.gff``. Genes are grouped per contig and sorted by start
        position. In GFF mode gene, mRNA, exon and CDS lines are written; in
        GTF mode only exon and CDS lines are written. A CDS without a parent
        gene or mRNA gets one synthesised from its own id and location.

        :param getGenomeOptions: options dict for ``get_genome_v1``. NOTE: it
            is modified in place (``no_metadata`` is set and the
            ``included_fields`` / ``included_feature_fields`` keys are
            removed).
        :param output_dir: directory the file is created in
        :param output_filename: file name without extension
        :param is_gtf: truthy to write GTF, falsy to write GFF
        :raises ValueError: wrapping any exception raised while building or
            writing the file
        """
        # first get subdata needed; forget about the metadata
        getGenomeOptions['no_metadata'] = 1
        getGenomeOptions.pop('included_fields', None)
        getGenomeOptions.pop('included_feature_fields', None)

        api = GenomeAnnotationAPI(self.cfg.callbackURL)
        genome_data = api.get_genome_v1(getGenomeOptions)['genomes'][0]['data']

        # create the file
        output = None
        try:
            file_ext = ".gtf" if is_gtf else ".gff"
            out_file_path = os.path.join(output_dir,
                                         output_filename + file_ext)
            print('Creating file: ' + str(out_file_path))
            output = open(out_file_path, 'w')
            features = []
            if 'features' in genome_data:
                for f in genome_data['features']:
                    features.append({
                        'id': f['id'],
                        'type': f['type'],
                        'location': f['location']
                    })
            # Parent links are read with .get(): the pairing loop below
            # already handles a parent of None, but a hard [] lookup used to
            # raise KeyError before that handling could run.
            if 'cdss' in genome_data:
                for f in genome_data['cdss']:
                    features.append({
                        'id': f['id'],
                        'type': 'CDS',
                        'location': f['location'],
                        'parent_gene': f.get('parent_gene'),
                        'parent_mrna': f.get('parent_mrna')
                    })
            if 'mrnas' in genome_data:
                for f in genome_data['mrnas']:
                    features.append({
                        'id': f['id'],
                        'type': 'mRNA',
                        'location': f['location'],
                        'parent_gene': f.get('parent_gene')
                    })
            mrna_map = {}  ## mrna_id -> <mRNA>
            gene_map = {}  ## gene_id -> <gene>
            ## gene is {'id': <>, 'location': [[contig,start,strand,len], ...],
            ##          'mrna_cds_pairs': [[<mRNA>, <CDS>], ...]}
            # Everything that is neither mRNA nor CDS is treated as a gene.
            for f in features:
                if f['type'] == 'mRNA':
                    mrna_map[f['id']] = f
                elif f['type'] != 'CDS':
                    gene_map[f['id']] = f
            ## Now let's go over CDSs
            for f in features:
                if f['type'] == 'CDS':
                    gene_id = f.get('parent_gene')
                    gene = None
                    if gene_id:
                        gene = gene_map.get(gene_id)
                    rename_cds = False
                    if gene is None:
                        if gene_id is None:
                            # No parent at all: the gene takes over the CDS
                            # id, so the CDS itself is renamed further down
                            # to keep ids distinct.
                            gene_id = f['id']
                            rename_cds = True
                        gene = {
                            'id': gene_id,
                            'location': self.get_common_location(f['location'])
                        }
                        gene_map[gene_id] = gene
                    mrna_id = f.get('parent_mrna')
                    mrna = None
                    if mrna_id:
                        mrna = mrna_map.get(mrna_id)
                    if mrna is None:
                        if mrna_id is None:
                            mrna_id = f['id'] + '_mRNA'
                        # Synthesised mRNA reuses the CDS location.
                        mrna = {'id': mrna_id, 'location': f['location']}
                        mrna_map[mrna_id] = mrna
                    if rename_cds:
                        f['id'] = f['id'] + '_CDS'
                    mrna_cds_pairs = gene.get('mrna_cds_pairs')
                    if mrna_cds_pairs is None:
                        mrna_cds_pairs = []
                        gene['mrna_cds_pairs'] = mrna_cds_pairs
                    mrna_cds_pairs.append([mrna, f])
            ## Let's sort genes by contigs
            contigs = []  ## contig is {'genes': []}
            contig_map = {}
            for gene_id in gene_map:
                gene = gene_map[gene_id]
                gene['start'] = self.get_start(gene['location'][0])
                contig_id = gene['location'][0][0]
                contig = contig_map.get(contig_id)
                if contig is None:
                    contig = {'id': contig_id, 'genes': []}
                    contig_map[contig_id] = contig
                    contigs.append(contig)
                contig['genes'].append(gene)

            for contig in contigs:
                contig['genes'].sort(key=lambda gene: gene['start'])

            # write the file
            exon_id_generation = 1
            for contig in contigs:
                contig_id = contig['id']
                for gene in contig['genes']:
                    gene_id = gene['id']
                    strand = gene['location'][0][2]
                    if not is_gtf:
                        self.write_gff_line(output, contig_id, 'gene',
                                            gene['start'],
                                            self.get_end(gene['location'][0]),
                                            strand, '.', gene_id, None)
                    if 'mrna_cds_pairs' not in gene:
                        continue
                    for [mrna, cds] in gene['mrna_cds_pairs']:
                        mrna_id = mrna['id']
                        mrna_loc = self.get_common_location(
                            mrna['location'])[0]
                        if not is_gtf:
                            self.write_gff_line(output, contig_id, 'mRNA',
                                                self.get_start(mrna_loc),
                                                self.get_end(mrna_loc), strand,
                                                '.', mrna_id, gene_id)
                        mrna_exons = self.get_location_as_sorted_exons(
                            mrna['location'], strand)
                        for exon in mrna_exons:
                            # The exon counter advances in both modes, but
                            # only GFF lines carry the generated exon id.
                            exon_id = 'exon_' + str(exon_id_generation)
                            exon_id_generation += 1
                            if is_gtf:
                                self.write_gtf_line(output, contig_id, 'exon',
                                                    exon['start'], exon['end'],
                                                    strand, '.', gene_id,
                                                    mrna_id)
                            else:
                                self.write_gff_line(output, contig_id, 'exon',
                                                    exon['start'], exon['end'],
                                                    strand, '.', exon_id,
                                                    mrna_id)
                        cds_exons = self.get_location_as_sorted_exons(
                            cds['location'], strand)
                        cds_id = cds['id']
                        frame = 0
                        for exon in cds_exons:
                            f_start = exon['start']
                            f_end = exon['end']
                            f_length = f_end - f_start + 1
                            if is_gtf:
                                self.write_gtf_line(output, contig_id, 'CDS',
                                                    f_start, f_end, strand,
                                                    frame, gene_id, mrna_id)
                            else:
                                self.write_gff_line(output, contig_id, 'CDS',
                                                    f_start, f_end, strand,
                                                    frame, cds_id, mrna_id)
                            # Carry the reading-frame offset into the next
                            # CDS segment.
                            frame = (3 - ((f_length - frame) % 3)) % 3

        except Exception as e:
            # "as" form is valid on Python 2.6+ and Python 3.
            raise ValueError("Failed to create file: {0}".format(e))
        finally:
            # Always release the handle, even when writing fails midway.
            if output is not None:
                output.close()
Exemplo n.º 21
0
class vConTACTUtils:

    def __init__(self, config):
        """Set up paths, credentials and the service clients used by the utilities.

        :param config: mapping providing ``scratch`` (working directory) and
            ``workspace-url``
        :raises KeyError: if ``SDK_CALLBACK_URL`` or ``KB_AUTH_TOKEN`` is not
            set in the environment, or a config key is missing
        """
        # The scratch path was assigned twice in the original; once is enough.
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.token = os.environ['KB_AUTH_TOKEN']
        self.ws = Workspace(config['workspace-url'], token=self.token)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)

    def vcontact_help(self):
        """Run ``vcontact --help`` through ``_run_command``."""
        self._run_command("vcontact --help")

    def execute(self, command: list):
        """Run a command without a shell and return the completed process.

        :param command: argument list in subprocess form, e.g. ``['ls', '-l']``
        :return: the ``subprocess.CompletedProcess`` for the finished command
        :raises subprocess.CalledProcessError: on a non-zero exit status
            (``check=True``)
        """
        printable = ' '.join(command)
        print('Running command: {}'.format(printable))
        return subprocess.run(command, shell=False, encoding='utf-8', check=True)

    def run_vcontact(self, params):
        """Prepare the inputs for the referenced object, run vcontact2, build the report.

        The workspace type string of ``params['genome']`` selects the input
        preparation:

        * type contains ``assembly``: proteins are predicted with prodigal and
          a gene-to-genome CSV is derived from the protein ids;
        * type contains ``kbasegenomes``: inputs are derived from the genome's
          features;
        * type contains ``binnedcontigs``: not supported.

        ``params`` is modified in place (``gene2genome`` and ``sequences`` are
        set to the generated file paths).

        :param params: mapping with ``genome`` plus one entry for every key of
            the option mapping below
        :return: the value returned by ``_generate_report``
        :raises ValueError: for BinnedContigs objects, or for an unrecognised
            type when ``gene2genome``/``sequences`` were not supplied
        :raises KeyError: if an option listed in the mapping is missing
        """
        # Function-scope import so the block stays self-contained.
        import shlex

        # Determine KBase "inputs" for vConTACT2
        genome = params['genome']

        obj_type = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][2]
        obj_type_lower = obj_type.lower()

        if 'assembly' in obj_type_lower:  # If KBaseGenomeAnnotations.Assembly

            # Assembly requires annotation
            genome_fp = self.au.get_assembly_as_fasta({'ref': genome})['path']
            proteins_fp = os.path.join(self.scratch, 'proteins.faa')
            proteins_gbk = os.path.join(self.scratch, 'proteins.gbk')
            gene2genome_fp = os.path.join(self.scratch, 'gene2genome.csv')

            prodigal_cmd = ['prodigal', '-a', proteins_fp, '-o', proteins_gbk, '-f', 'gbk',
                            '-i', genome_fp, '-p', 'meta']
            self.execute(prodigal_cmd)

            records = {}
            with open(proteins_fp, 'r') as proteins_fh:
                for record in SeqIO.parse(proteins_fh, 'fasta'):
                    # The contig id is the protein id minus its final
                    # '_<suffix>' component.
                    records[len(records)] = {
                        'protein_id': record.id,
                        'contig_id': record.id.rsplit('_', 1)[0],
                        'keywords': 'None'
                    }

            g2g_df = pd.DataFrame.from_dict(records, orient='index')
            g2g_df.to_csv(gene2genome_fp, index=False)

            # Pass filepaths to the app and run
            params['gene2genome'] = gene2genome_fp
            params['sequences'] = proteins_fp

        elif 'kbasegenomes' in obj_type_lower:  # If KBaseGenomes.Genome
            genome_data = self.genome_api.get_genome_v1({"genomes": [{"ref": genome}]})

            # Convert genome data into "reasonable" parse form and write to scratch filesystem
            gene2genome, sequences = self.genome_to_inputs(genome_data)
            gene2genome_fp, sequences_fp = self.write_inputs(gene2genome, sequences)

            # Pass filepaths to the app and run
            params['gene2genome'] = gene2genome_fp
            params['sequences'] = sequences_fp

        elif 'binnedcontigs' in obj_type_lower:  # If KBaseMetagenomes.BinnedContigs
            # Raise instead of exit(1): SystemExit would take down the whole
            # process and bypass ordinary ``except Exception`` handling.
            raise ValueError(
                'KBaseMetagenomes.BinnedContigs hasnt been enabled. Check back later.')
        else:
            # Only continue when the caller already supplied both input
            # files; otherwise fail here with a clear message rather than
            # with a KeyError while building the command.
            if 'gene2genome' not in params or 'sequences' not in params:
                raise ValueError(
                    'Unknown error in identifying object types: {}'.format(obj_type))
            print('Unknown error in identifying object types')

        # Diagnostic listing only; a missing directory must not abort the run.
        data_dir = '/miniconda/lib/python3.7/site-packages/vcontact2/data/'
        print('Available database files')
        if os.path.isdir(data_dir):
            print(os.listdir(data_dir))

        # Just iterate through all parameters
        mappings = {
            'gene2genome': '--proteins-fp',
            'sequences': '--raw-proteins',
            'db': '--db',
            'pcs_mode': '--pcs-mode',
            'vcs_mode': '--vcs-mode',
            'blast_evalue': '--blast-evalue',
            'pc_max_overlap': '--max-overlap',
            'pc_penalty': '--penalty',
            'pc_haircut': '--haircut',
            'pc_inflation': '--pc-inflation',
            'vc_inflation': '--vc-inflation',
            'vc_density': '--min-density',
            'vc_min_size': '--min-size',
            'vc_max_overlap': '--vc-overlap',
            'vc_penalty': '--vc-penalty',
            'vc_haircut': '--vc-haircut',
            'merge_method': '--merge-method',
            'similarity': '--similarity',
            'seed_method': '--seed-method',
            'min_significance': '--sig',
            'max_significance': '--max-sig',
            'module_inflation': '--mod-inflation',
            'mod_significance': '--mod-sig',
            'module_min_shared': '--mod-shared-min',
            'link_significance': '--link-sig',
            'link_proportion': '--link-prop'
        }

        # Should create build_command?
        command = 'vcontact2 --output-dir outdir'
        # Binaries
        command += ' --diamond-bin /usr/local/bin/diamond --c1-bin /usr/local/bin/cluster_one-1.0.jar'

        # The command is executed through a shell, so every value is quoted;
        # plain numbers and ordinary paths come out unchanged.
        for param, cmd in mappings.items():
            command += ' {} {}'.format(cmd, shlex.quote(str(params[param])))

        self._run_command(command)

        report = self._generate_report(params)

        return report

    def _run_command(self, command):
        """
        _run_command: run a shell command and log its captured standard output

        :param command: command line string, executed with ``shell=True``
        :raises ValueError: if the command exits with a non-zero status; the
            message contains the command, exit code and captured output
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        # The pipe yields bytes; decode so the log and error text show the
        # real output instead of a b'...' repr with escaped newlines.
        if isinstance(output, bytes):
            output = output.decode('utf-8', errors='replace')

        details = 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)

        if exitCode != 0:
            raise ValueError('Error running command:\n{}\n'.format(command) + details)

        log('Executed command:\n{}\n'.format(command) + details)

    def genome_to_inputs(self, genome):
        """
        genome_to_inputs: convert genome annotation data (~json) to the inputs required by vConTACT

        Only features of type ``gene`` that carry an ``id``, a ``dna_sequence``
        and a ``protein_translation`` are used; a message is printed for each
        feature skipped for lacking an id or a sequence.

        :param genome: ``get_genome_v1`` style response; features are read
            from ``genome['genomes'][0]['data']['features']``
        :return: tuple ``(gene2genome, records)`` where ``gene2genome`` is an
            OrderedDict mapping feature id to a row with ``contig_id``,
            ``protein_id`` and ``keywords``, and ``records`` is the list of
            protein SeqRecord objects
        """

        records = []
        gene2genome = OrderedDict()

        genome_data = genome['genomes'][0]

        for item in genome_data['data']['features']:
            # Report *before* skipping: the messages used to sit after
            # ``continue`` and were never printed.
            if 'id' not in item:
                print('This feature does not have a valid id')
                continue
            if 'dna_sequence' not in item or 'protein_translation' not in item:
                print('This feature {} does not have a valid DNA sequence.'.format(item['id']))
                continue
            if item['type'] != 'gene':
                continue

            # Prefer 'functions', fall back to 'function', then ''.
            # NOTE(review): 'functions' may be a list rather than a string --
            # confirm the description type the FASTA writer accepts.
            desc = (item['functions'] if item.get('functions', None)
                    else item.get('function', ''))

            # Create FASTA record
            records.append(SeqRecord(Seq(item['protein_translation']), id=item['id'],
                                     description=desc))

            # Build gene2genome. 'keywords' keeps using 'function' when it is
            # present and otherwise falls back to the description instead of
            # raising KeyError.
            gene2genome[item['id']] = {
                'contig_id': item['location'][0][0],
                'protein_id': item['id'],
                'keywords': item.get('function', desc)
            }

        return gene2genome, records

    def write_inputs(self, mapping, sequences):
        """Write the protein FASTA and the gene-to-genome CSV into the scratch directory.

        :param mapping: dict-like whose values are rows with the keys
            ``contig_id``, ``protein_id`` and ``keywords``
        :param sequences: sequence records, written in FASTA format
        :return: tuple ``(gene2genome_csv_path, proteins_fasta_path)``
        """
        proteins_path = os.path.join(self.scratch, 'vConTACT_proteins.fasta')
        with open(proteins_path, 'w') as proteins_out:
            SeqIO.write(sequences, proteins_out, 'fasta')

        mapping_path = os.path.join(self.scratch, 'vConTACT_gene2genome.csv')
        with open(mapping_path, 'w') as mapping_out:
            csv_writer = csv.DictWriter(mapping_out,
                                        fieldnames=['contig_id', 'protein_id', 'keywords'])
            csv_writer.writeheader()
            # One row per entry, in the mapping's own iteration order.
            csv_writer.writerows(mapping.values())

        return mapping_path, proteins_path

    def _generate_report(self, params):
        """
        _generate_report: generate summary report

        This will contain ALL the logic to generate the report, including areas that should/will be re-factored later

        Reads 'outdir/genome_by_genome_overview.csv' (relative to the current working directory),
        renders it as an HTML table inside html_template, copies the table header into a <tfoot>
        placed right after </tbody>, writes the page to 'index.html' in a fresh directory under
        self.scratch, uploads that directory as a zip through file_to_shock and creates an
        extended report from it.

        :param params: dict; only 'workspace_name' is read here
        :return: dict with 'report_name' and 'report_ref' taken from the created report
        """

        # Client used to upload the report directory
        self.dfu = dfu(self.callback_url)

        # Get filepath of summary file
        summary_fp = os.path.join(os.getcwd(), 'outdir', 'genome_by_genome_overview.csv')

        summary_df = pd.read_csv(summary_fp, header=0, index_col=0)
        # The classes string closes the class attribute early on purpose so that an id
        # attribute ends up on the generated <table> tag.
        html = summary_df.to_html(index=False, classes='my_class table-striped" id = "my_id')

        direct_html = html_template.substitute(html_table=html)

        # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer
        start_header = Literal("<thead>")
        end_header = Literal("</thead>")

        text = start_header + SkipTo(end_header)

        new_text = ''
        for data, start_pos, end_pos in text.scanString(direct_html):
            # The match holds "<thead>" plus everything up to (not including) "</thead>":
            # rename the opening tag, drop the alignment style and close the footer by hand.
            new_text = ''.join(data).replace(' style="text-align: right;"', '').replace('thead>',
                                                                                        'tfoot>\n  ') + '\n</tfoot>'

        # Get start and end positions to insert new text
        end_tbody_tag = "</tbody>"
        end_tbody = Literal(end_tbody_tag)
        end_table = Literal("</table>")

        insertion_pos = end_tbody + SkipTo(end_table)

        # Fall back to the footer-less page: if no insertion point is found the report should
        # still show the table rather than be written out as an empty file.
        final_html = direct_html
        for data, start_pos, end_pos in insertion_pos.scanString(direct_html):
            # start_pos is where "</tbody>" begins, so the footer goes right after that tag
            footer_pos = start_pos + len(end_tbody_tag)
            final_html = direct_html[:footer_pos] + '\n' + new_text + direct_html[footer_pos:]

        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_dir)
        result_fp = os.path.join(output_dir, 'index.html')

        with open(result_fp, 'w') as result_fh:
            result_fh.write(final_html)

        # The whole output directory is uploaded, packed as a zip
        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_report = [{
            'shock_id': report_shock_id,
            'name': os.path.basename(result_fp),
            'label': os.path.basename(result_fp),
            'description': 'HTML summary report for vConTACT2'
        }]

        report_params = {'message': 'Basic message to show in the report',
                         'workspace_name': params['workspace_name'],
                         'html_links': html_report,
                         'direct_html_link_index': 0,
                         'report_object_name': 'vConTACT_report_{}'.format(str(uuid.uuid4())),
                         # Don't use until have files to attach to report
                         # 'file_links': [{}],
                         # Don't use until data objects that are created as result of running app
                         # 'objects_created': [{'ref': matrix_obj_ref,
                         #                      'description': 'Imported Matrix'}],
                         }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        # https://stackoverflow.com/a/600612/643675
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise