Example #1
    @classmethod
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('SetAPI'):
            cls.cfg[nameval[0]] = nameval[1]
        authServiceUrl = cls.cfg.get('auth-service-url',
                "https://kbase.us/services/authorization/Sessions/Login")
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'SetAPI',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = SetAPI(cls.cfg)

        # Set up data at the class level so this code runs once for all tests,
        # not before each test case. (Not sure how to do that outside this
        # function.)
        suffix = int(time.time() * 1000)
        wsName = "test_SetAPI_" + str(suffix)
        ret = cls.wsClient.create_workspace({'workspace': wsName})
#        wsName = 'pranjan77:1477441032423'
        cls.wsName = wsName
        # copy test file to scratch area
        fna_filename = "seq.fna"
        fna_path = os.path.join(cls.cfg['scratch'], fna_filename)
        shutil.copy(os.path.join("data", fna_filename), fna_path)

        ru = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        ws_obj_name = 'MyNewAssembly'
        cls.assembly1ref = ru.save_assembly_from_fasta({
            'file': {'path': fna_path},
            'workspace_name': wsName,
            'assembly_name': 'assembly_obj_1'
        })
        cls.assembly2ref = ru.save_assembly_from_fasta({
            'file': {'path': fna_path},
            'workspace_name': wsName,
            'assembly_name': 'assembly_obj_2'
        })
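A class-level setup like this is usually paired with a tearDownClass that deletes the test workspace; a minimal sketch, assuming the cls.wsName and cls.wsClient set above (the same pattern appears in the virsorter test class further down):

    @classmethod
    def tearDownClass(cls):
        # Sketch only: assumes setUpClass stored wsName/wsClient on the class
        if hasattr(cls, 'wsName'):
            cls.wsClient.delete_workspace({'workspace': cls.wsName})
            print('Test workspace was deleted')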
Example #2
    def test_filter_contigs_by_length_01(self):
        method = 'filter_contigs_by_length_01'

        print("\n\nRUNNING: test_filter_contigs_by_length_01()")
        print("===========================================\n\n")

        # upload test data
        try:
            auClient = AssemblyUtil(self.callback_url,
                                    token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))
        ass_file_1 = 'assembly_1.fa'
        ass_file_2 = 'assembly_2.fa'
        ass_path_1 = os.path.join(self.scratch, ass_file_1)
        ass_path_2 = os.path.join(self.scratch, ass_file_2)
        shutil.copy(os.path.join("data", ass_file_1), ass_path_1)
        shutil.copy(os.path.join("data", ass_file_2), ass_path_2)
        ass_ref_1 = auClient.save_assembly_from_fasta({
            'file': {'path': ass_path_1},
            'workspace_name': self.getWsName(),
            'assembly_name': 'assembly_1'
        })
        ass_ref_2 = auClient.save_assembly_from_fasta({
            'file': {'path': ass_path_2},
            'workspace_name': self.getWsName(),
            'assembly_name': 'assembly_2'
        })

        # run method
        input_refs = [ass_ref_1, ass_ref_2]
        base_output_name = method + '_output'
        params = {
            'workspace_name': self.getWsName(),
            'input_assembly_refs': input_refs,
            'min_contig_length': 1000,
            'output_name': 'test_filtered'
        }
        result = self.getImpl().run_filter_contigs_by_length(
            self.getContext(), params)
        print('RESULT:')
        pprint(result)
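        # A fuller test would assert on the returned report as well; a sketch,
        # assuming the result shape used by the other tests in this collection:
        self.assertTrue(len(result) > 0)
        self.assertIn('report_ref', result[0])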
Example #3
    def save_assembly(self, wsname, output_contigs, token, name, console):
        self.log(console, 'Uploading FASTA file to Assembly')
        assemblyUtil = AssemblyUtil(self.callbackURL,
                                    token=token,
                                    service_ver='dev')
        assemblyUtil.save_assembly_from_fasta({
            'file': {'path': output_contigs},
            'workspace_name': wsname,
            'assembly_name': name
        })
Example #4
    def getBogusAssembly(self):
        # Create a fake assembly with lots of contigs

        assembly_file_name = "bogus.fna"  # "AP009048.fna"
        assembly_temp_file = os.path.join("/kb/module/work/tmp",
                                          assembly_file_name)
        with open(assembly_temp_file, "w") as f:
            for i in range(1, 30002):
                f.write("> contig_%d\n" % i)
                f.write(
                    "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC\n"
                )

        assembly_name = "Assembly.2"
        au = AssemblyUtil(os.environ["SDK_CALLBACK_URL"],
                          token=self.getContext()["token"])
        assembly_ref = au.save_assembly_from_fasta({
            "file": {"path": assembly_temp_file},
            "workspace_name": self.getWsName(),
            "assembly_name": assembly_name
        })
        self.assembly_ref = assembly_ref
        print("Uploaded bogus assembly " + str(assembly_ref))
        return assembly_ref
Example #5
    def load_test_genome_direct(self, filename, assembly_filename, obj_name):
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        assembly_ref = au.save_assembly_from_fasta({
            'workspace_name': self.getWsName(),
            'assembly_name': obj_name + '.assembly',
            'file': {'path': assembly_filename}
        })
        pprint('created test assembly: ' + assembly_ref)

        with open(filename, 'r') as file:
            data_str = file.read()
        data = json.loads(data_str)
        data['assembly_ref'] = assembly_ref
        # save to ws
        save_info = {
            'workspace': self.getWsName(),
            'objects': [{
                'type': 'KBaseGenomes.Genome',
                'data': data,
                'name': obj_name + '.genome'
            }]
        }
        result = self.ws.save_objects(save_info)
        info = result[0]
        ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        print('created test genome: ' + ref + ' from file ' + filename)
        return ref
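A hypothetical call to this helper from a test method (both data file names below are placeholders):

        # Sketch only: pairs a Genome JSON dump with its assembly FASTA
        genome_ref = self.load_test_genome_direct(
            'data/ecoli.json', 'data/ecoli_assembly.fa', 'ecoli_test')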
Example #6
    def test_annotate_contigs(self):
        assembly_file_name = "small.fna"  # "AP009048.fna"
        assembly_test_file = os.path.join("/kb/module/test/data", assembly_file_name)
        assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name)
        shutil.copy(assembly_test_file, assembly_temp_file)
        assembly_name = 'Assembly.1'
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token'])
        assembly_ref = au.save_assembly_from_fasta({'file': {'path': assembly_temp_file},
                                                    'workspace_name': self.getWsName(),
                                                    'assembly_name': assembly_name})
        genome_name = "Genome.1"
        result = self.getImpl().annotate_contigs(self.getContext(),
                                                 {'assembly_ref': assembly_ref,
                                                  'output_workspace': self.getWsName(),
                                                  'output_genome_name': genome_name,
                                                  'evalue': None,
                                                  'fast': 0,
                                                  'gcode': None,
                                                  'genus': '',
                                                  'kingdom': 'Bacteria',
                                                  'metagenome': 0,
                                                  'mincontiglen': 1,
                                                  'norrna': 0,
                                                  'notrna': 0,
                                                  'rawproduct': 0,
                                                  'rfam': 1,
                                                  'scientific_name': 'Super : diper - name;'
                                                  })[0]
        rep = self.getWsClient().get_objects([{'ref': result['report_ref']}])[0]['data']
        self.assertTrue('text_message' in rep)
        print("Report:\n" + str(rep['text_message']))
Example #7
    def test_annotate_contigs(self):
        assembly_file_name = "small.fna"  # "AP009048.fna"
        assembly_test_file = os.path.join("/kb/module/test/data", assembly_file_name)
        assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name)
        shutil.copy(assembly_test_file, assembly_temp_file)
        assembly_name = 'Assembly.1'
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        assembly_ref = au.save_assembly_from_fasta({'file': {'path': assembly_temp_file},
                                                    'workspace_name': self.getWsName(),
                                                    'assembly_name': assembly_name})
        # Add a genome to the WS to test ref_paths
        genome_name = "Genome.1"
        genome = {'id': 'Unknown', 'features': [],
                  'scientific_name': "",
                  'domain': "", 'genetic_code': 0,
                  'assembly_ref': assembly_ref,
                  'cdss': [], 'mrnas': [],
                  'source': 'Magic!',
                  'gc_content': 0, 'dna_size': 0,
                  'reference_annotation': 0}
        prov = self.getContext().provenance()
        ga = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'])
        info = ga.save_one_genome_v1(
            {'workspace': self.getWsName(), 'name': genome_name,
             'data': genome, 'provenance': prov})['info']
        genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        result = self.getImpl().annotate_contigs(self.getContext(),
                                                 {'assembly_ref': "{};{}".format(genome_ref, assembly_ref),
                                                  'output_workspace': self.getWsName(),
                                                  'output_genome_name': genome_name,
                                                  'evalue': None,
                                                  'fast': 0,
                                                  'gcode': 0,
                                                  'genus': 'genus',
                                                  'kingdom': 'Bacteria',
                                                  'metagenome': 0,
                                                  'mincontiglen': 1,
                                                  'norrna': 0,
                                                  'notrna': 0,
                                                  'rawproduct': 0,
                                                  'rfam': 1,
                                                  'scientific_name': 'Super : diper - name;'
                                                  })[0]
        rep = self.getWsClient().get_objects([{'ref': result['report_ref']}])[0]['data']
        self.assertTrue('text_message' in rep)
        print("Report:\n" + str(rep['text_message']))
        genome_ref = self.getWsName() + "/" + genome_name
        genome = self.getWsClient().get_objects([{'ref': genome_ref}])[0]['data']
        features_to_work = {}
        for feature in genome['features']:
            features_to_work[feature['id']] = feature['location']
        aseq = AssemblySequenceAPI(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token'])
        dna_sequences = aseq.get_dna_sequences({'requested_features': features_to_work,
                                                'assembly_ref': genome['assembly_ref']})['dna_sequences']
        bad_dnas = 0
        for feature in genome['features']:
            if feature['dna_sequence'] != dna_sequences[feature['id']]:
                bad_dnas += 1
        self.assertEqual(bad_dnas, 0)
Example #8
    def load_fasta_file(self, filename, obj_name, contents):
        with open(filename, 'w') as f:
            f.write(contents)
        assemblyUtil = AssemblyUtil(self.callback_url)
        assembly_ref = assemblyUtil.save_assembly_from_fasta({'file': {'path': filename},
                                                              'workspace_name': self.getWsName(),
                                                              'assembly_name': obj_name
                                                              })
        return assembly_ref
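A hypothetical call that uploads a small FASTA built inline (the scratch path and sequences are made up for illustration):

        # Sketch only: placeholder path and contents
        ref = self.load_fasta_file('/kb/module/work/tmp/mini.fa', 'mini_assembly',
                                   '>contig1\nACGTACGTACGT\n>contig2\nGGGGCCCC\n')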
Example #9
    def load_fasta_file(self, path, name):
        assembly_util = AssemblyUtil(self.callback_url)
        return assembly_util.save_assembly_from_fasta({
            'file': {'path': path},
            'workspace_name': self.getWsName(),
            'assembly_name': name
        })
Example #10
    def loadAssembly(self):
        if hasattr(self.__class__, 'assembly_ref'):
            return self.__class__.assembly_ref
        fasta_path = os.path.join(self.scratch, 'test.fna')
        shutil.copy(os.path.join('data', 'test.fna'), fasta_path)
        au = AssemblyUtil(self.callback_url)
        assembly_ref = au.save_assembly_from_fasta({'file': {'path': fasta_path},
                                                    'workspace_name': self.getWsName(),
                                                    'assembly_name': 'test_assembly'})
        self.__class__.assembly_ref = assembly_ref
        return assembly_ref
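This fixture caches the uploaded ref on the class so the slow upload runs only once per test class; a minimal sketch of a test relying on that behavior (the test name is assumed):

    def test_assembly_is_cached(self):
        # Sketch only: the second call should return the class-cached ref
        ref = self.loadAssembly()
        self.assertEqual(ref, self.loadAssembly())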
Example #11
    def get_fasta_file(self, filename, obj_name):
        assemblyUtil = AssemblyUtil(self.callback_url)
        assembly_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filename},
            'workspace_name': self.getWsName(),
            'assembly_name': obj_name
        })
        return assembly_ref
Example #12
    def loadFasta2Assembly(self, filename):
        fn, ext = os.path.splitext(filename)
        fasta_path = os.path.join(self.scratch, filename)
        shutil.copy(os.path.join('../testReads', filename), fasta_path)
        au = AssemblyUtil(self.callback_url)
        a_ref = au.save_assembly_from_fasta({
            'file': {'path': fasta_path},
            'workspace_name': self.getWsName(),
            'assembly_name': fn
        })
        return a_ref
Example #13
    def get_genome_ref(self, ws_name, tf='ecoliMG1655.fa'):
        if hasattr(self.__class__, 'genomeInfo'):
            return self.__class__.genomeInfo
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        target = os.path.join(self.scratch, tf)
        self.genome_path = target
        shutil.copy('data/' + tf, target)
        self.__class__.genomeInfo = au.save_assembly_from_fasta({
            'file': {'path': target},
            'workspace_name': ws_name,
            'assembly_name': tf.split('.fa')[0]
        })
        return self.__class__.genomeInfo
Example #14
    def load_fasta_file(self, filename, obj_name, contents):
        # TODO make this use the data folder (not sure of relative path)
        with open(filename, 'w') as f:
            f.write(contents)
        assemblyUtil = AssemblyUtil(self.callback_url)
        # TODO why does this next line take forever
        assembly_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filename},
            'workspace_name': self.getWsName(),
            'assembly_name': obj_name
        })
        return assembly_ref
Example #15
def load_fasta_file(callback_url, ws_name, filename, obj_name, contents):
    """
    Writes the given FASTA contents to a file and loads it into the workspace
    as an Assembly object, returning the new ref.
    """
    with open(filename, 'w') as f:
        f.write(contents)
    assembly_util = AssemblyUtil(callback_url)
    assembly_ref = assembly_util.save_assembly_from_fasta({
        'file': {'path': filename},
        'workspace_name': ws_name,
        'assembly_name': obj_name
    })
    return assembly_ref
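A hypothetical invocation of this module-level helper (every argument below is a placeholder):

# Sketch only: URL, workspace, path, name, and sequence are all made up
ref = load_fasta_file('http://localhost:9999', 'my_test_workspace',
                      '/kb/module/work/tmp/test.fa', 'test_assembly',
                      '>contig1\nAGCTTTTCATTCTGACTGCA\n')
print('Saved assembly: ' + ref)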
Example #16
    def test_annotate_contigs(self):
        assembly_file_name = "small.fna"  # "AP009048.fna"
        assembly_test_file = os.path.join("/kb/module/test/data", assembly_file_name)
        assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name)
        shutil.copy(assembly_test_file, assembly_temp_file)
        assembly_name = 'Assembly.1'
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token'])
        assembly_ref = au.save_assembly_from_fasta({'file': {'path': assembly_temp_file},
                                                    'workspace_name': self.getWsName(),
                                                    'assembly_name': assembly_name})
        genome_name = "Genome.1"
        result = self.getImpl().annotate_contigs(self.getContext(),
                                                 {'assembly_ref': assembly_ref,
                                                  'output_workspace': self.getWsName(),
                                                  'output_genome_name': genome_name,
                                                  'evalue': None,
                                                  'fast': 0,
                                                  'gcode': 0,
                                                  'genus': 'genus',
                                                  'kingdom': 'Bacteria',
                                                  'metagenome': 0,
                                                  'mincontiglen': 1,
                                                  'norrna': 0,
                                                  'notrna': 0,
                                                  'rawproduct': 0,
                                                  'rfam': 1,
                                                  'scientific_name': 'Super : diper - name;'
                                                  })[0]
        rep = self.getWsClient().get_objects([{'ref': result['report_ref']}])[0]['data']
        self.assertTrue('text_message' in rep)
        print("Report:\n" + str(rep['text_message']))
        genome_ref = self.getWsName() + "/" + genome_name
        genome = self.getWsClient().get_objects([{'ref': genome_ref}])[0]['data']
        features_to_work = {}
        for feature in genome['features']:
            features_to_work[feature['id']] = feature['location']
        aseq = AssemblySequenceAPI(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token'])
        dna_sequences = aseq.get_dna_sequences({'requested_features': features_to_work,
                                                'assembly_ref': genome['assembly_ref']})['dna_sequences']
        bad_dnas = 0
        for feature in genome['features']:
            if feature['dna_sequence'] != dna_sequences[feature['id']]:
                bad_dnas += 1
        self.assertEqual(bad_dnas, 0)
Example #17
    def loadAssembly(self):
        if hasattr(self.__class__, 'assembly_ref'):
            return self.__class__.assembly_ref
        # return '23735/1/1'
        fasta_path = os.path.join(self.scratch, 'test_ref.fa')
        shutil.copy(os.path.join('data', 'bt_test_data', 'test_ref.fa'),
                    fasta_path)
        au = AssemblyUtil(self.callback_url)
        assembly_ref = au.save_assembly_from_fasta({
            'file': {'path': fasta_path},
            'workspace_name': self.getWsName(),
            'assembly_name': 'test_assembly'
        })
        self.__class__.assembly_ref = assembly_ref
        print('Loaded Assembly: ' + assembly_ref)
        return assembly_ref
Example #18
    def loadAssembly(self):
        if hasattr(self.__class__, 'assembly_ref'):
            return self.__class__.assembly_ref
        fasta_path = os.path.join(self.scratch, 'star_test_assembly.fa')
        # shutil.copy(os.path.join('../work/testReads', 'test_reference.fa'), fasta_path)
        shutil.copy(
            os.path.join('../work/testReads',
                         'Arabidopsis_thaliana.TAIR10.dna.toplevel.fa'),
            fasta_path)
        au = AssemblyUtil(self.callback_url)
        assembly_ref = au.save_assembly_from_fasta({
            'file': {'path': fasta_path},
            'workspace_name': self.getWsName(),
            'assembly_name': 'star_test_assembly'
        })
        self.__class__.assembly_ref = assembly_ref
        print('Loaded Assembly: ' + assembly_ref)
        return assembly_ref
Example #19
    def upload_assembly(self, file_path, workspace_name, assembly_name):
        """
        From a list of file paths, uploads them to KBase, generates Assembly objects,
        then returns the generated UPAs.
        """
        if not file_path:
            raise ValueError("file_path must be defined")
        if not os.path.exists(file_path):
            raise ValueError("The given assembly file '{}' does not exist".format(file_path))
        if not workspace_name:
            raise ValueError("workspace_name must be defined")
        if not assembly_name:
            raise ValueError("assembly_name must be defined")

        au = AssemblyUtil(self.callback_url)
        assembly_upa = au.save_assembly_from_fasta({
            "file": {
                "path": file_path
            },
            "workspace_name": workspace_name,
            "assembly_name": assembly_name
        })
        return assembly_upa
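A hypothetical call, assuming a test harness with a getWsName() helper like the ones above (the file path is a placeholder):

        # Sketch only: placeholder path and assembly name
        upa = self.upload_assembly('/kb/module/test/data/assembly_1.fa',
                                   self.getWsName(), 'assembly_1')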
Example #20
    def test_annotate_contigs_too_big(self):
        """
        simulate a metagenome contig file
        """
        # Create a fake assembly with lots of contigs
        assembly_file_name = "bogus.fna"  #"AP009048.fna"
        assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name)
        with open(assembly_temp_file, 'w') as f:
            for i in range(1,30002):
                f.write('> contig_%d\n' % i)
                f.write('AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC\n')

        assembly_name = 'Assembly.2'
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token'])
        assembly_ref = au.save_assembly_from_fasta({'file': {'path': assembly_temp_file}, 
                                                    'workspace_name': self.getWsName(),
                                                    'assembly_name': assembly_name})
        genome_name = "Genome.1"
        # This should fail with an error
        with self.assertRaises(ValueError):
            self.getImpl().annotate_contigs(self.getContext(),
                                            {'assembly_ref': assembly_ref,
                                             'output_workspace': self.getWsName(),
                                             'output_genome_name': genome_name,
                                             'evalue': None,
                                             'fast': 0,
                                             'gcode': 0,
                                             'genus': 'genus',
                                             'kingdom': 'Bacteria',
                                             'metagenome': 0,
                                             'mincontiglen': 1,
                                             'norrna': 0,
                                             'notrna': 0,
                                             'rawproduct': 0,
                                             'rfam': 1,
                                             'scientific_name': 'Super : diper - name;'
                                             })
Example #21
    def filter_contigs(self, ctx, params):
        """
        The actual function is declared using 'funcdef' to specify the name
        and input/return arguments to the function.  For all typical KBase
        Apps that run in the Narrative, your function should have the
        'authentication required' modifier.
        :param params: instance of type "FilterContigsParams" (A 'typedef'
           can also be used to define compound or container objects, like
           lists, maps, and structures.  The standard KBase convention is to
           use structures, as shown here, to define the input and output of
           your function.  Here the input is a reference to the Assembly data
           object, a workspace to save output, and a length threshold for
           filtering. To define lists and maps, use a syntax similar to C++
           templates to indicate the type contained in the list or map.  For
           example: list <string> list_of_strings; mapping <string, int>
           map_of_ints;) -> structure: parameter "assembly_input_ref" of type
           "assembly_ref" (A 'typedef' allows you to provide a more specific
           name for a type.  Built-in primitive types include 'string',
           'int', 'float'.  Here we define a type named assembly_ref to
           indicate a string that should be set to a KBase ID reference to an
           Assembly data object.), parameter "workspace_name" of String,
           parameter "min_length" of Long
        :returns: instance of type "FilterContigsResults" (Here is the
           definition of the output of the function.  The output can be used
           by other SDK modules which call your code, or the output
           visualizations in the Narrative.  'report_name' and 'report_ref'
           are special output fields- if defined, the Narrative can
           automatically render your Report.) -> structure: parameter
           "report_name" of String, parameter "report_ref" of String,
           parameter "assembly_output" of type "assembly_ref" (A 'typedef'
           allows you to provide a more specific name for a type.  Built-in
           primitive types include 'string', 'int', 'float'.  Here we define
           a type named assembly_ref to indicate a string that should be set
           to a KBase ID reference to an Assembly data object.), parameter
           "n_initial_contigs" of Long, parameter "n_contigs_removed" of
           Long, parameter "n_contigs_remaining" of Long
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN filter_contigs

        # Print statements to stdout/stderr are captured and available as the App log
        print('Starting Filter Contigs function. Params=')
        pprint(params)

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        print('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError(
                'Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError(
                'Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError(
                'Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError(
                'Cannot parse integer from min_length parameter (' +
                str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' +
                             str(min_length) + ')')

        # Step 2 - Download the input data as a FASTA file.
        # We can use the AssemblyUtil module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        print('Downloading Assembly data as a FASTA file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(
            {'ref': assembly_input_ref})

        # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
        # We can use BioPython to parse the Fasta file and build and save the output to a file.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        print('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' +
              str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder,
                                           'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

        # Step 4 - Save the new Assembly back to the system
        print('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filtered_fasta_file},
            'workspace_name': workspace_name,
            'assembly_name': fasta_file['assembly_name']
        })

        # Step 5 - Build a Report and return
        reportObj = {
            'objects_created': [{
                'ref': new_assembly,
                'description': 'Filtered contigs'
            }],
            'text_message': 'Filtered Assembly to ' + str(n_remaining) +
                            ' contigs out of ' + str(n_total)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': params['workspace_name']
        })

        # STEP 6: construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'assembly_output': new_assembly,
            'n_initial_contigs': n_total,
            'n_contigs_removed': n_total - n_remaining,
            'n_contigs_remaining': n_remaining
        }
        print('returning: ' + pformat(output))

        #END filter_contigs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method filter_contigs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
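A unit test for this method, in the style of the harness helpers shown above, might look like this sketch (the getImpl/getContext/getWsName helpers, the load_fasta_file(filename, obj_name, contents) fixture, and the FASTA contents are all assumed):

    def test_filter_contigs(self):
        # Sketch only: one 1500-bp contig that survives the filter, one short
        # contig that does not
        ref = self.load_fasta_file('/kb/module/work/tmp/filter_test.fa', 'filter_test',
                                   '>c1\n' + 'A' * 1500 + '\n>c2\nACGTACGT\n')
        result = self.getImpl().filter_contigs(self.getContext(), {
            'workspace_name': self.getWsName(),
            'assembly_input_ref': ref,
            'min_length': 1000
        })[0]
        self.assertEqual(result['n_initial_contigs'], 2)
        self.assertEqual(result['n_contigs_remaining'], 1)
        self.assertEqual(result['n_contigs_removed'], 1)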
Example #22
    def arast_run(self,
                  ctx,
                  params,
                  assembler,
                  server='http://localhost:8000'):
        output = None

        console = []
        self.log(console, 'Running run_{} with params='.format(assembler))
        self.log(console, pformat(params))

        #### do some basic checks
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'read_library_refs' not in params and 'read_library_names' not in params:
            raise ValueError(
                'read_library_refs or read_library_names parameter is required'
            )
        if 'read_library_refs' in params:
            if not isinstance(params['read_library_refs'], list):
                raise ValueError('read_library_refs must be a list')
        if 'read_library_names' in params:
            if not isinstance(params['read_library_names'], list):
                raise ValueError('read_library_names must be a list')
        if 'output_contigset_name' not in params:
            raise ValueError('output_contigset_name parameter is required')
        min_contig_len = params.get('min_contig_len') or 300

        token = ctx['token']

        os.environ["KB_AUTH_TOKEN"] = token
        os.environ["ARAST_URL"] = server

        ws = workspaceService(self.workspaceURL)
        ws_libs = []
        if 'read_library_refs' in params:
            for lib_ref in params['read_library_refs']:
                ws_libs.append({'ref': lib_ref})
        if 'read_library_names' in params:
            for lib_name in params['read_library_names']:
                ws_libs.append(
                    {'ref': params['workspace_name'] + '/' + lib_name})
        if len(ws_libs) == 0:
            raise ValueError(
                'At least one read library must be provided in read_library_refs or read_library_names'
            )
        libs = ws.get_objects2({'objects': ws_libs})['data']

        wsid = libs[0]['info'][6]

        kbase_assembly_input = self.combine_read_libs(libs)
        tmp_data = self.create_temp_json(kbase_assembly_input)

        mode = ''
        cmd = ['ar-run', '--data-json', tmp_data]
        if assembler:
            cmd = cmd + ['-a', assembler]
            mode = 'assembler: ' + assembler
        elif 'pipeline' in params and params['pipeline']:
            cmd = cmd + ['-p', params['pipeline']]
            mode = 'assembly pipeline: ' + params['pipeline']
        else:
            recipe = params.get('recipe', 'auto')
            cmd = cmd + ['-r', recipe]
            mode = 'assembly recipe: ' + recipe

        logger.info('Start {}'.format(mode))
        logger.debug('CMD: {}'.format(' '.join(cmd)))

        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()
        logger.debug(out)

        if p.returncode != 0:
            raise ValueError('Error running ar_run, return code: {}\n'.format(
                p.returncode))

        job_id = None
        match = re.search(r'(\d+)', out)
        if match:
            job_id = match.group(1)
        else:
            raise ValueError('No integer job ID found: {}\n'.format(out))

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        output_contigs = os.path.join(output_dir, 'contigs.fa')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        cmd = ['ar-get', '-j', job_id, '-w', '-l']
        logger.debug('CMD: {}'.format(' '.join(cmd)))
        ar_log = subprocess.check_output(cmd)

        self.log(console, ar_log)

        cmdstr = 'ar-get -j {} -w -p | ar-filter -l {} > {}'.format(
            job_id, min_contig_len, output_contigs)
        logger.debug('CMD: {}'.format(cmdstr))
        subprocess.check_call(cmdstr, shell=True)

        cmd = ['ar-get', '-j', job_id, '-w', '-r']
        logger.debug('CMD: {}'.format(' '.join(cmd)))
        ar_report = subprocess.check_output(cmd)

        self.log(console, "\nDONE\n")

        client = AssemblyUtil(self.callback_url)
        assembly_ref = client.save_assembly_from_fasta({
            'file': {
                'path': output_contigs
            },
            'workspace_name':
            params['workspace_name'],
            'assembly_name':
            params['output_contigset_name']
        })

        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        if 'read_library_names' in params:
            provenance[0]['input_ws_objects'] = [
                params['workspace_name'] + '/' + x
                for x in params['read_library_names']
            ]
        elif 'read_library_refs' in params:
            provenance[0]['input_ws_objects'] = [
                x for x in params['read_library_refs']
            ]

        os.remove(tmp_data)
        #shutil.rmtree(output_dir)

        # create a Report
        report = ''
        report += '============= Raw Contigs ============\n' + ar_report + '\n'

        report += '========== Filtered Contigs ==========\n'
        report += 'ContigSet saved to: ' + params[
            'workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Average Length: ' + str(
            sum(lengths) / float(len(lengths))) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   ' + str(counts[c]) + '\t--\t' + str(
                edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        print(report)

        reportObj = {
            'objects_created': [{
                'ref':
                params['workspace_name'] + '/' +
                params['output_contigset_name'],
                'description':
                'Assembled contigs'
            }],
            'text_message':
            report
        }

        reportName = '{}.report.{}'.format(assembler, job_id)
        report_obj_info = ws.save_objects({
            'id': wsid,
            'objects': [{
                'type': 'KBaseReport.Report',
                'data': reportObj,
                'name': reportName,
                'meta': {},
                'hidden': 1,
                'provenance': provenance
            }]
        })[0]

        output = {
            'report_name': reportName,
            'report_ref': str(report_obj_info[6]) + '/' + str(report_obj_info[0]) +
                          '/' + str(report_obj_info[4])
        }

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method arast_run return value ' +
                             'output is not type dict as required.')
        # return the results
        return output
Example #23
    def filter_contigs(self, ctx, params):
        """
        :param workspace_name: instance of String
        :param params: instance of type "ContigFilterParams" (Input
           parameters) -> structure: parameter "assembly_ref" of String,
           parameter "min_length" of Long
        :returns: instance of type "ContigFilterResults" (Output results) ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String, parameter "filtered_assembly_ref" of
           String, parameter "n_total" of Long, parameter "n_remaining" of
           Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_contigs
        for name in ['min_length', 'assembly_ref', 'workspace_name']:
            if name not in params:
                raise ValueError('Parameter "' + name + '" is required but missing')
        if not isinstance(params['min_length'], int) or (params['min_length'] < 0):
            raise ValueError('Min length must be a non-negative integer')
        if not isinstance(params['assembly_ref'], str) or not len(params['assembly_ref']):
            raise ValueError('Pass in a valid assembly reference string')
        ws_name = params['workspace_name']
        assembly_util = AssemblyUtil(self.callback_url)
        file = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})
        # Parse the downloaded file in FASTA format
        parsed_assembly = SeqIO.parse(file['path'], 'fasta')
        min_length = params['min_length']
        # Keep a list of contigs greater than min_length
        good_contigs = []
        # total contigs regardless of length
        n_total = 0
        # total contigs over the min_length
        n_remaining = 0
        for record in parsed_assembly:
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1
        # Create a file to hold the filtered data
        filtered_path = os.path.join(self.scratch, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_path, 'fasta')
        # Upload the filtered data to the workspace
        new_ref = assembly_util.save_assembly_from_fasta({
            'file': {'path': filtered_path},
            'workspace_name': ws_name,
            'assembly_name': file['assembly_name']
        })
        # Create an output summary message for the report
        text_message = "".join([
            'Filtered assembly to ',
            str(n_remaining),
            ' contigs out of ',
            str(n_total)
        ])
        # Data for creating the report, referencing the assembly we uploaded
        html_dir = os.path.join(self.scratch, 'html')
        html_index_path = os.path.join(html_dir, 'index.html')
        file_path = os.path.join(self.scratch, 'myfile.txt')
        with open(file_path, 'w') as f:
            f.write('hello world')
        os.mkdir(html_dir)
        with open(html_index_path, 'w') as f:
            f.write('<p><b>hello world</b></p>')
        html_links = [{
            'path': os.path.join(html_dir, 'index.html'),
            'name': 'main.html',
            'description': 'Sample description'
        }]
        file_links = [{
            'path': file_path,
            'name': 'file.txt',
            'description': 'Sample file description'
        }] + html_links
        # Extended report
        report_data = {
            'objects_created': [{'ref': new_ref, 'description': 'Filtered contigs'}],
            'html_links': html_links,
            'file_links': file_links,
            'warnings': ['warning 1', 'warning 2'],
            'report_object_name': 'my_report',
            'direct_html': '<p>Hello</p>',
            'message': text_message,
            'workspace_name': ws_name,
            'direct_html_link_index': 0,
            'html_window_height': 800,
            'summary_window_height': 800
        }
        # # Simple report
        # report_data = {
        #     'report': {
        #         'text_message': 'My simple report text message',
        #         'warnings': ['warning 1', 'warning 2'],
        #         'objects_created': [{'ref': new_ref, 'description': 'filtered contigs'}]
        #     },
        #     'workspace_name': ws_name
        # }
        # Initialize the report
        kbase_report = KBaseReport(self.callback_url)
        report = kbase_report.create_extended_report(report_data)
        # Return the report reference and name in our results
        returnVal = {
            'report_ref': report['ref'],
            'report_name': report['name'],
            'n_total': n_total,
            'n_remaining': n_remaining,
            'filtered_assembly_ref': new_ref
        }
        #END filter_contigs

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method filter_contigs return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
Example #24
    def arast_run(self, ctx, params, assembler, server='http://localhost:8000'):
        output = None

        console = []
        self.log(console, 'Running run_{} with params='.format(assembler))
        self.log(console, pformat(params))

        #### do some basic checks
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'read_library_refs' not in params and 'read_library_names' not in params:
            raise ValueError('read_library_refs or read_library_names parameter is required')
        if 'read_library_refs' in params:
            if not isinstance(params['read_library_refs'], list):
                raise ValueError('read_library_refs must be a list')
        if 'read_library_names' in params:
            if not isinstance(params['read_library_names'], list):
                raise ValueError('read_library_names must be a list')
        if 'output_contigset_name' not in params:
            raise ValueError('output_contigset_name parameter is required')
        min_contig_len = params.get('min_contig_len') or 300

        token = ctx['token']

        os.environ["KB_AUTH_TOKEN"] = token
        os.environ["ARAST_URL"] =  server

        ws = workspaceService(self.workspaceURL)
        ws_libs = []
        if 'read_library_refs' in params:
            for lib_ref in params['read_library_refs']:
                ws_libs.append({'ref': lib_ref})
        if 'read_library_names' in params:
            for lib_name in params['read_library_names']:
                ws_libs.append({'ref': params['workspace_name'] + '/' + lib_name})
        if len(ws_libs) == 0:
            raise ValueError('At least one read library must be provided in read_library_refs or read_library_names')
        libs = ws.get_objects2({'objects': ws_libs})['data']

        wsid = libs[0]['info'][6]

        kbase_assembly_input = self.combine_read_libs(libs)
        tmp_data = self.create_temp_json(kbase_assembly_input)

        mode = ''
        cmd = ['ar-run', '--data-json', tmp_data]
        if assembler:
            cmd = cmd + ['-a', assembler]
            mode = 'assembler: ' + assembler
        elif 'pipeline' in params and params['pipeline']:
            cmd = cmd + ['-p', params['pipeline']]
            mode = 'assembly pipeline: ' + params['pipeline']
        else:
            recipe = params.get('recipe', 'auto')
            cmd = cmd + ['-r', recipe]
            mode = 'assembly recipe: ' + recipe

        logger.info('Start {}'.format(mode))
        logger.debug('CMD: {}'.format(' '.join(cmd)))

        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT, shell=False)

        out, err = p.communicate()
        logger.debug(out)

        if p.returncode != 0:
            raise ValueError('Error running ar_run, return code: {}\n'.format(p.returncode))

        job_id = None
        match = re.search(r'(\d+)', out)
        if match:
            job_id = match.group(1)
        else:
            raise ValueError('No integer job ID found: {}\n'.format(out))

        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000)
        output_dir = os.path.join(self.scratch, 'output.'+str(timestamp))
        output_contigs = os.path.join(output_dir, 'contigs.fa')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        cmd = ['ar-get', '-j', job_id, '-w', '-l']
        logger.debug('CMD: {}'.format(' '.join(cmd)))
        ar_log = subprocess.check_output(cmd)

        self.log(console, ar_log)

        cmdstr = 'ar-get -j {} -w -p | ar-filter -l {} > {}'.format(job_id, min_contig_len, output_contigs)
        logger.debug('CMD: {}'.format(cmdstr))
        subprocess.check_call(cmdstr, shell=True)

        cmd = ['ar-get', '-j', job_id, '-w', '-r']
        logger.debug('CMD: {}'.format(' '.join(cmd)))
        ar_report = subprocess.check_output(cmd)

        self.log(console, "\nDONE\n")

        client = AssemblyUtil(self.callback_url)
        assembly_ref = client.save_assembly_from_fasta({
            'file': {'path': output_contigs},
            'workspace_name': params['workspace_name'],
            'assembly_name': params['output_contigset_name']
        })

        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        if 'read_library_names' in params:
            provenance[0]['input_ws_objects'] = [params['workspace_name'] + '/' + x
                                                 for x in params['read_library_names']]
        elif 'read_library_refs' in params:
            provenance[0]['input_ws_objects'] = list(params['read_library_refs'])


        os.remove(tmp_data)
        #shutil.rmtree(output_dir)

        # create a Report
        report = ''
        report += '============= Raw Contigs ============\n' + ar_report + '\n'

        report += '========== Filtered Contigs ==========\n'
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Average Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        print(report)

        reportObj = {
            'objects_created':[{'ref':params['workspace_name']+'/'+params['output_contigset_name'], 'description':'Assembled contigs'}],
            'text_message': report
        }

        reportName = '{}.report.{}'.format(assembler, job_id)
        report_obj_info = ws.save_objects({
                'id': wsid,
                'objects': [
                    {
                        'type': 'KBaseReport.Report',
                        'data': reportObj,
                        'name': reportName,
                        'meta': {},
                        'hidden': 1,
                        'provenance': provenance
                    }
                ]
            })[0]

        output = {'report_name': reportName,
                  'report_ref': str(report_obj_info[6]) + '/' + str(report_obj_info[0]) +
                                '/' + str(report_obj_info[4])}

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method arast_run return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return output
Example #25
class kb_virsorterTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        user_id = requests.post(
            'https://kbase.us/services/authorization/Sessions/Login',
            data='token={}&fields=user_id'.format(token)).json()['user_id']
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'kb_virsorter',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})

        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_virsorter'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = kb_virsorter(cls.cfg)

        cls.testobjref = []
        #cls.testobjdata = []
        cls.testwsname = []

    @classmethod
    def tearDownClass(cls):
        if hasattr(cls, 'wsName'):
            cls.wsClient.delete_workspace({'workspace': cls.wsName})
            print('Test workspace was deleted')

        if hasattr(cls, 'testwsname') and len(cls.testwsname) > 0:
            try:
                print('Deleting workspace 2 ' + cls.testwsname[0])
                cls.wsClient.delete_workspace({'workspace': cls.testwsname[0]})
                print('Test workspace 2 was deleted ' + cls.testwsname[0])
            except Exception as e:
                print(e)

        #if hasattr(cls, 'testobjdata'):
        #    try:
        #        print('Deleting shock data ' + str(len(cls.testobjdata)))
        #        print('Deleting shock data ' + str(len(cls.testobjdata[0]['data'][0])))
        #        print('Deleting shock data ' + str(cls.testobjdata[0]))
        #        node = cls.testobjdata[0]['data'][0]['lib']['file']['id']
        #        cls.delete_shock_node(node)
        #        print('Test shock data was deleted')
        #    except Exception as e:
        #        print e

    def getWsClient(self):
        return self.__class__.wsClient

    def getWsName(self):
        if hasattr(self.__class__, 'wsName'):
            return self.__class__.wsName
        suffix = int(time.time() * 1000)
        wsName = "test_kb_virsorter_" + str(suffix)
        ret = self.getWsClient().create_workspace({'workspace': wsName})
        self.__class__.wsName = wsName
        return wsName

    def getImpl(self):
        return self.__class__.serviceImpl

    def getContext(self):
        return self.__class__.ctx
    
    
    def write_file(self, filename, content):
        tmp_dir = self.cfg['scratch']
        file_path = os.path.join(tmp_dir, filename)
        with open(file_path, 'w') as fh1:
            fh1.write(content)
        return file_path


    def delete_shock_node(self, node_id):
        # Assumes token and shockURL were stored on the test class during setup
        header = {'Authorization': 'Oauth {0}'.format(self.token)}
        requests.delete(self.shockURL + '/node/' + node_id, headers=header,
                        allow_redirects=True)

    def ztest_aaa_upload_to_shock(self):

        print("upload ref data to shock staging")
        self.dfUtil = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        # file_path = self.write_file('Phage_gene_catalog.tar.gz', 'Test')

        input_file_name = 'Phage_gene_catalog_plus_viromes.tar.gz'  # 'Phage_gene_catalog.tar.gz', 'PFAM_27.tar.gz'
        source_file_path = "/kb/module/work/" + input_file_name  # os.path.join(tmp_dir, input_file_name)

        tmp_dir = self.cfg['scratch']
        target_file_path = os.path.join(tmp_dir, input_file_name)

        print("file_path " + source_file_path + "\t" + target_file_path)

        orig_size = os.path.getsize(source_file_path)

        shutil.copy(source_file_path, target_file_path)

        print("Testing " + target_file_path)
        print(os.path.isfile(target_file_path))

        ret1 = self.dfUtil.file_to_shock(
            {'file_path': target_file_path})

        print(str(ret1))
        shock_id = ret1['shock_id']

        print("shock_id " + shock_id)
        file_path2 = os.path.join("/kb/module/work/", 'test.tar.gz')

        # ret2 = self.dfUtil.shock_to_file(
        #     {'shock_id': shock_id, 'file_path': file_path2})[0]
        ret2 = self.dfUtil.shock_to_file(
            {'shock_id': shock_id, 'file_path': file_path2})

        print(ret2)

        file_name = ret2['node_file_name']
        attribs = ret2['attributes']
        self.assertEqual(file_name, 'Phage_gene_catalog_plus_viromes.tar.gz')
        self.assertEqual(ret2['file_path'], file_path2)
        self.assertEqual(ret2['size'], orig_size)
        self.assertIsNone(attribs)

        # self.delete_shock_node(shock_id)


    def create_random_string(self):
        N = 20
        return ''.join(
            random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(N))

    def test_virsorter_ok(self):
        self.upload_assembly()

        if not self.testwsname:
            self.testwsname.append(self.create_random_string())

        print("upload_reads self.testwsname[0] " + self.testwsname[0])

        # try:
        #     ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  # test_ws_name
        # except Exception as e:
        #     print(e)
        #     pass

        print("self.testwsname " + str(self.testwsname))
        params = {}
        params['assembly_ref'] = str(self.testobjref[0])
        params['ws_name'] = self.testwsname[0]

        result = self.getImpl().run_virsorter(self.getContext(), params)
        print('RESULT run_virsorter:')
        pprint(result)

        testresult = [{'report_ref': result[0]['report_ref'],
                       'report_name': result[0]['report_name']}]

        self.assertEqual(result, testresult)


    def upload_assembly(self):
        if not self.testobjref:

            print("upload_assembly start")

            indata = 'U00096.2.fa'
            ftarget = os.path.join(self.cfg['scratch'], indata)
            print("ftarget " + ftarget)
            ret = shutil.copy('../test_data/' + indata, ftarget)

            self.assemblyUtilClient = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])

            if not self.testwsname:
                self.testwsname.append(self.create_random_string())

            print("upload_assembly self.testwsname[0] " + self.testwsname[0])

            try:
                ret = self.wsClient.create_workspace(
                    {'workspace': self.testwsname[0]})
            except Exception as e:
                # the workspace may already exist; log and continue
                print(e)

            try:
                print("attempt upload")
                print("ftarget " + ftarget)
                ref = self.assemblyUtilClient.save_assembly_from_fasta(
                    {'workspace_name': self.testwsname[0],
                     'assembly_name': 'Ecolik12MG1655',
                     'file': {'path': ftarget}})

                print("upload_assembly")
                print(ref)
                self.testobjref.append(
                    self.testwsname[0] + '/Ecolik12MG1655/1')

            except Exception as e:
                print(e)

            print("self.testobjref[0]")
            print(self.testobjref)
            print(self.testobjref[0])
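For context, the Impl method exercised above takes just two keys. A minimal sketch of the call, with illustrative values (the test builds these dynamically):

# Hypothetical values for illustration; see test_virsorter_ok above.
params = {
    'assembly_ref': 'my_test_ws/Ecolik12MG1655/1',  # workspace object reference
    'ws_name': 'my_test_ws',                        # workspace to receive the report
}
# result = impl.run_virsorter(ctx, params)
# result[0] carries 'report_name' and 'report_ref' on success.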
Exemplo n.º 26
0
    def run_hipmer_hpc(self, ctx, params):
        """
        :param params: instance of type "AssemblyParams" (Run assembler
           workspace_name - the name of the workspace for input/output
           read_library_name - the name of the PE read library (SE library
           support in the future) output_contig_set_name - the name of the
           output contigset extra_params - assembler specific parameters
           min_contig_length - minimum length of contigs to output, default
           200 @optional min_contig_len @optional extra_params) -> structure:
           parameter "workspace_name" of String, parameter
           "read_library_name" of String, parameter "output_contigset_name"
           of String, parameter "min_contig_len" of Long, parameter
           "extra_params" of list of String
        :returns: instance of type "AssemblyOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_hipmer_hpc
        console = []
        self.log(console, 'Running run_hipmer_hpc with params=')
        self.log(console, pformat(params))

        # Validate parameters.  This will raise an error if there
        # is a problem.
        self._validate_inputs(params)
        ws_name = params['workspace_name']
        #ws = workspaceService(self.workspaceURL, token=ctx['token'])

        if 'POST' not in os.environ:
            # Get the read library
            print "Running pre stage"
            refs = []
            for read in params['reads']:
                read_name = read['read_library_name']
                if '/' in read_name:
                    ref = read_name
                else:
                    ref = ws_name + '/' + read_name
                refs.append(ref)
                read['ref'] = ref
            if not self.check_reads(ctx, refs, console):
                raise ValueError('The reads failed validation\n')

            params['readsfiles'] = self.get_reads_RU(ctx, refs, console)
            self.fixup_reads(params)

            # Generate submit script
            ts = self.generate_config(params)
            self.generate_submit(ts)
            return

        print "Running POST stage"

        # run hipmer, capture output as it happens
        self.log(console, 'running hipmer:')

        output_contigs = os.path.join(self.scratch, 'results',
                                      'final_assembly.fa')
        output_name = params['output_contigset_name']
        if not os.path.exists(output_contigs):
            print "It looks like HipMER failed for some reason."
            print "Show errors in log file"
            logfile = ''
            for fn in os.listdir('.'):
                if fn.startswith('slurm-'):
                    logfile = fn
            if logfile != '':
                with open(logfile, 'r') as f:
                    for line in f:
                        if line.lower().find('error') >= 0:
                            print line
            raise RuntimeError("Error in HipMER execution")

        wsname = params['workspace_name']
        self.log(console, 'Uploading FASTA file to Assembly')
        assemblyUtil = AssemblyUtil(self.callbackURL,
                                    token=ctx['token'],
                                    service_ver='dev')
        save_input = {
            'file': {
                'path': output_contigs
            },
            'workspace_name': wsname,
            'assembly_name': output_name
        }
        output_data_ref = assemblyUtil.save_assembly_from_fasta(save_input)
        # create a Report
        # compute a simple contig length distribution for the report
        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        report = ''
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/'
        report += params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Avg Length: ' + str(
            sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   %d\t--\t%d' % (counts[c], edges[c])
            report += ' to %d bp\n' % (edges[c + 1])

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({
                'files': [{
                    'path': output_contigs,
                    'label': params['output_contigset_name']
                }]
            })
        except QUASTError as qe:
            # not really any way to test this, all inputs have been checked
            # earlier and should be ok
            print('Logging exception from running QUAST')
            print(str(qe))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report({
                'message':
                report,
                'objects_created': [{
                    'ref': output_data_ref,
                    'description': 'Assembled contigs'
                }],
                'direct_html_link_index':
                0,
                'html_links': [{
                    'shock_id': quastret['shock_id'],
                    'name': 'report.html',
                    'label': 'QUAST report'
                }],
                'report_object_name':
                'kb_megahit_report_' + str(uuid.uuid4()),
                'workspace_name':
                params['workspace_name']
            })
        except _RepError as re:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from creating report object')
            print(str(re))
            # TODO delete shock node
            raise

        # STEP 6: construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        #END run_hipmer_hpc

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_hipmer_hpc return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
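Reading the AssemblyParams docstring together with the pre/POST split above, a plausible input sketch looks like this (workspace and library names are placeholders, not from the source):

# Illustrative only; key names follow the AssemblyParams docstring.
params = {
    'workspace_name': 'my_workspace',
    'reads': [{'read_library_name': 'my_pe_library'}],  # resolved to ws refs in the pre stage
    'output_contigset_name': 'hipmer.contigs',
    'min_contig_len': 200,  # @optional, default 200
    'extra_params': [],     # @optional assembler-specific flags
}
# Pre stage ('POST' not in os.environ): validate reads, write config + submit script.
# POST stage: expect results/final_assembly.fa, upload it, and build the report.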
Exemplo n.º 27
0
    def run_megahit(self, ctx, params):
        """
        :param params: instance of type "MegaHitParams" (Run MEGAHIT.  Most
           parameters here are just passed forward to MEGAHIT workspace_name
           - the name of the workspace for input/output read_library_ref -
           the name of the PE read library (SE library support in the future)
           output_contig_set_name - the name of the output contigset
           megahit_parameter_preset - override a group of parameters;
           possible values: meta            '--min-count 2 --k-list
           21,41,61,81,99' (generic metagenomes, default) meta-sensitive 
           '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more
           sensitive but slower) meta-large      '--min-count 2 --k-list
           27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
           bulk            '--min-count 3 --k-list 31,51,71,91,99 --no-mercy'
           (experimental, standard bulk sequencing with >= 30x depth)
           single-cell     '--min-count 3 --k-list 21,33,55,77,99,121
           --merge_level 20,0.96' (experimental, single cell data) min_count
           - minimum multiplicity for filtering (k_min+1)-mers, default 2
           min_k - minimum kmer size (<= 127), must be odd number, default 21
           max_k - maximum kmer size (<= 127), must be odd number, default 99
           k_step - increment of kmer size of each iteration (<= 28), must be
           even number, default 10 k_list - list of kmer size (all must be
           odd, in the range 15-127, increment <= 28); override `--k-min',
           `--k-max' and `--k-step' min_contig_length - minimum length of
           contigs to output, default is 2000 @optional
           megahit_parameter_preset @optional min_count @optional k_min
           @optional k_max @optional k_step @optional k_list @optional
           min_contig_length) -> structure: parameter "workspace_name" of
           String, parameter "read_library_ref" of String, parameter
           "output_contigset_name" of String, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_length" of Long
        :returns: instance of type "MegaHitOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_megahit
        print('Running run_megahit with params=')
        pprint(params)

        # STEP 1: basic parameter checks + parsing
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'read_library_ref' not in params:
            raise ValueError('read_library_ref parameter is required')
        if 'output_contigset_name' not in params:
            raise ValueError('output_contigset_name parameter is required')

        # STEP 2: get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {
            'read_libraries': [input_ref],
            'interleaved': 'false',
            'gzipped': None
        }
        ru = ReadsUtils(self.callbackURL)
        reads = ru.download_reads(reads_params)['files']

        print('Input reads files:')
        fwd = reads[input_ref]['files']['fwd']
        rev = reads[input_ref]['files']['rev']
        pprint('forward: ' + fwd)
        pprint('reverse: ' + rev)

        # STEP 3: run megahit
        # construct the command
        megahit_cmd = [self.MEGAHIT]

        # we only support PE reads, so add that
        megahit_cmd.append('-1')
        megahit_cmd.append(fwd)
        megahit_cmd.append('-2')
        megahit_cmd.append(rev)

        # if a preset is defined, use that:
        if 'megahit_parameter_preset' in params:
            if params['megahit_parameter_preset']:
                megahit_cmd.append('--presets')
                megahit_cmd.append(params['megahit_parameter_preset'])

        if 'min_count' in params:
            if params['min_count']:
                megahit_cmd.append('--min-count')
                megahit_cmd.append(str(params['min_count']))
        if 'k_min' in params:
            if params['k_min']:
                megahit_cmd.append('--k-min')
                megahit_cmd.append(str(params['k_min']))
        if 'k_max' in params:
            if params['k_max']:
                megahit_cmd.append('--k-max')
                megahit_cmd.append(str(params['k_max']))
        if 'k_step' in params:
            if params['k_step']:
                megahit_cmd.append('--k-step')
                megahit_cmd.append(str(params['k_step']))
        if 'k_list' in params:
            if params['k_list']:
                k_list = []
                for k_val in params['k_list']:
                    k_list.append(str(k_val))
                megahit_cmd.append('--k-list')
                megahit_cmd.append(','.join(k_list))

        min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
        if 'min_contig_length' in params:
            if params['min_contig_length']:
                if str(params['min_contig_length']).isdigit():
                    min_contig_length = params['min_contig_length']
                else:
                    raise ValueError(
                        'min_contig_length parameter must be a non-negative integer'
                    )

        megahit_cmd.append('--min-contig-len')
        megahit_cmd.append(str(min_contig_length))

        # set the number of cpus
        megahit_cmd.append('--num-cpu-threads')
        megahit_cmd.append(str(multiprocessing.cpu_count() - 1))

        # set the output location
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        megahit_cmd.append('-o')
        megahit_cmd.append(output_dir)

        # run megahit
        print('running megahit:')
        print('    ' + ' '.join(megahit_cmd))
        p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
        retcode = p.wait()

        print('Return code: ' + str(retcode))
        if p.returncode != 0:
            raise ValueError('Error running MEGAHIT, return code: ' +
                             str(retcode) + '\n')

        output_contigs = os.path.join(output_dir, 'final.contigs.fa')

        # on Macs, we cannot run MEGAHIT in the shared host scratch space, so we need to move the file there
        if self.mac_mode:
            shutil.move(output_contigs,
                        os.path.join(self.host_scratch, 'final.contigs.fa'))
            output_contigs = os.path.join(self.host_scratch,
                                          'final.contigs.fa')

        # STEP 4: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL)
        output_data_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {
                'path': output_contigs
            },
            'workspace_name':
            params['workspace_name'],
            'assembly_name':
            params['output_contigset_name']
        })

        # STEP 5: generate and save the report

        # compute a simple contig length distribution for the report
        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        report = ''
        report += 'ContigSet saved to: ' + params[
            'workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Avg Length: ' + str(
            sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   ' + str(counts[c]) + '\t--\t' + str(
                edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({
                'files': [{
                    'path': output_contigs,
                    'label': params['output_contigset_name']
                }]
            })
        except QUASTError as qe:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from running QUAST')
            print(str(qe))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report({
                'message':
                report,
                'objects_created': [{
                    'ref': output_data_ref,
                    'description': 'Assembled contigs'
                }],
                'direct_html_link_index':
                0,
                'html_links': [{
                    'shock_id': quastret['shock_id'],
                    'name': 'report.html',
                    'label': 'QUAST report'
                }],
                'report_object_name':
                'kb_megahit_report_' + str(uuid.uuid4()),
                'workspace_name':
                params['workspace_name']
            })
        except _RepError as re:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from creating report object')
            print(str(re))
            # TODO delete shock node
            raise

        # STEP 6: construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END run_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
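To make the parameter plumbing in STEP 3 concrete, here is roughly the command line it would assemble for one plausible params dict (all values illustrative):

# Assuming self.MEGAHIT == 'megahit' and reads downloaded to fwd.fq / rev.fq:
params = {
    'workspace_name': 'my_workspace',
    'read_library_ref': '123/4/5',
    'output_contigset_name': 'megahit.contigs',
    'megahit_parameter_preset': 'meta-sensitive',
    'min_contig_length': 500,
}
# STEP 3 would then build approximately:
#   megahit -1 fwd.fq -2 rev.fq --presets meta-sensitive \
#           --min-contig-len 500 --num-cpu-threads <cpus - 1> -o <scratch>/output.<timestamp>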
Exemplo n.º 28
0
    def run_A5(self, ctx, params):
        """
        Run A5 on paired end libraries
        :param params: instance of type "A5_Params" (Input parameters for
           running A5. workspace_name - the name of the workspace from which
           to take input and store output. output_contigset_name - the name
           of the output contigset libfile_args - parameters for each input
           paired end reads min_contig_length - minimum length of contigs in
           the assembly output metagenome - metagenome option to A5 @optional
           min_contig_length @optional metagenome) -> structure: parameter
           "workspace_name" of String, parameter "output_contigset_name" of
           String, parameter "libfile_args" of list of type
           "libfile_args_type" (Parameters for a paired end library entry in
           the input 'libfile') -> structure: parameter "libfile_library" of
           type "paired_end_lib" (The workspace object name of a
           PairedEndLibrary file, whether of the KBaseAssembly or KBaseFile
           type.), parameter "libfile_unpaired" of String, parameter
           "libfile_insert" of Long, parameter "min_contig_length" of Long,
           parameter "metagenome" of type "bool" (A boolean - 0 for false, 1
           for true. @range (0, 1))
        :returns: instance of type "A5_Output" (Output parameters for A5 run.
           string report_name - the name of the KBaseReport.Report workspace
           object. string report_ref - the workspace reference of the
           report.) -> structure: parameter "report_name" of String,
           parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_A5
        
        print("===================  IN run_A5")

        # A whole lot of this is adapted or outright copied from
        # https://github.com/msneddon/MEGAHIT
        self.log('Running run_A5 with params:\n' + pformat(params))

        # the reads should really be specified as a list of absolute ws refs
        # but the narrative doesn't do that yet
        self.process_params(params)
        pprint(params)

        token = ctx['token']

        # get absolute refs from ws
        wsname = params[self.PARAM_IN_WS]
        print("Workspace name: " + wsname)

        # set the output location
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        outdir = os.path.join(self.scratch, 'A5_dir' + str(timestamp))

        reads = self.get_input_reads(params, token)
        libFile = self.generate_libfile(params[self.PARAM_IN_LIBFILE_ARGS], reads, outdir)
        a5_output_prefix = params[self.PARAM_IN_CS_NAME]

        self.exec_A5(libFile, params, outdir)
        self.log('A5 output prefix: ' + a5_output_prefix)

        # parse the output and save back to KBase

        output_contigs = os.path.join(outdir, a5_output_prefix + ".contigs.fasta")

        min_contig_len = 0

        if self.PARAM_IN_MIN_CONTIG in params and params[self.PARAM_IN_MIN_CONTIG] is not None:
            if (int(params[self.PARAM_IN_MIN_CONTIG])) > 0:
                min_contig_len = int(params[self.PARAM_IN_MIN_CONTIG])

        self.log('Uploading FASTA file to Assembly')
        assemblyUtil = AssemblyUtil(self.callbackURL, token=ctx['token'], service_ver='dev')

        assemblyUtil.save_assembly_from_fasta({'file': {'path': output_contigs},
                                               'workspace_name': wsname,
                                               'assembly_name': params[self.PARAM_IN_CS_NAME],
                                               'min_contig_length': min_contig_len
                                               })

        report_name, report_ref = self.load_report(output_contigs, params, wsname)

        output = {'report_name': report_name,
                  'report_ref': report_ref
                  }

        #END run_A5

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_A5 return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
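A sketch of plausible run_A5 inputs, following the A5_Params docstring above (object names are placeholders):

# Illustrative only; field names come from the A5_Params docstring.
params = {
    'workspace_name': 'my_workspace',
    'output_contigset_name': 'a5.contigs',
    'libfile_args': [{
        'libfile_library': 'my_pe_library',  # PairedEndLibrary object name
        'libfile_unpaired': '',              # optional unpaired reads
        'libfile_insert': 300,               # insert size in bp
    }],
    'min_contig_length': 1000,  # @optional
    'metagenome': 0,            # @optional bool (0/1)
}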
Exemplo n.º 29
0
    def filter_contigs(self, ctx, params):
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_contigs

        # Print statements to stdout/stderr are captured and available as the App log
        print('Starting Filter Contigs function. Params=')
        pprint(params)

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        print('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError('Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError('Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError('Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError('Cannot parse integer from min_length parameter (' + str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' + str(min_length) + ')')


        # Step 2 - Download the input data as a Fasta and
        # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta({'ref': assembly_input_ref})


        # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
        # We can use BioPython to parse the Fasta file and build and save the output to a file.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        print('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')


        # Step 4 - Save the new Assembly back to the system
        print('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({'file': {'path': filtered_fasta_file},
                                                              'workspace_name': workspace_name,
                                                              'assembly_name': fasta_file['assembly_name']
                                                              })


        # Step 5 - Build a Report and return
        reportObj = {
            'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
            'text_message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})


        # STEP 6: construct the output to send back
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref'],
                  'assembly_output': new_assembly,
                  'n_initial_contigs': n_total,
                  'n_contigs_removed': n_total - n_remaining,
                  'n_contigs_remaining': n_remaining
                  }
        print('returning: ' + pformat(output))

        #END filter_contigs
        

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method filter_contigs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
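filter_contigs is straightforward to exercise end to end; a minimal sketch of the call and the fields it returns (refs are placeholders):

# Illustrative only; output keys are taken from the STEP 6 dict above.
params = {
    'workspace_name': 'my_workspace',
    'assembly_input_ref': '123/4/5',
    'min_length': 1000,
}
# [output] = impl.filter_contigs(ctx, params)
# output contains: report_name, report_ref, assembly_output (new Assembly ref),
# n_initial_contigs, n_contigs_removed, n_contigs_remaining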
Exemplo n.º 30
0
class ImportAssemblyUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'import_assembly_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_fasta_as_assembly_from_staging(self, params):
        '''
          import_fasta_as_assembly_from_staging: wrapper method for
                                    AssemblyUtil.save_assembly_from_fasta

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          assembly_name - output Assembly file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        '''
        log('--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        file = {'path': scratch_file_path}
        import_assembly_params = params
        import_assembly_params['file'] = file

        ref = self.au.save_assembly_from_fasta(import_assembly_params)
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), ref)

        returnVal = {'obj_ref': ref}
        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
                    validates params passed to import_fasta_as_assembly_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'workspace_name', 'assembly_name'
        ]:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

    def generate_html_report(self, assembly_ref, assembly_object, params):
        """
        _generate_html_report: generate html summary report
        """
        log('start generating html report')
        html_report = list()

        assembly_data = assembly_object.get('data')[0].get('data')
        assembly_info = assembly_object.get('data')[0].get('info')

        result_file_path = os.path.join(self.scratch, 'report.html')

        assembly_name = str(assembly_info[1])
        assembly_file = params.get('staging_file_subdir_path')

        dna_size = assembly_data.get('dna_size')
        num_contigs = assembly_data.get('num_contigs')

        assembly_overview_data = collections.OrderedDict()

        assembly_overview_data['Name'] = '{} ({})'.format(
            assembly_name, assembly_ref)
        assembly_overview_data['Uploaded File'] = assembly_file
        assembly_overview_data['Date Uploaded'] = time.strftime("%c")
        assembly_overview_data['DNA Size'] = dna_size
        assembly_overview_data['Number of Contigs'] = num_contigs

        overview_content = ''
        overview_content += '<br/><table>\n'
        for key, val in assembly_overview_data.items():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td>'.format(val)
            overview_content += '</tr>\n'
        overview_content += '</table>'

        contig_data = assembly_data.get('contigs').values()
        contig_content = str([[str(e['contig_id']), e['length']]
                              for e in contig_data])

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__),
                                   'report_template_assembly.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>*Overview_Content*</p>', overview_content)
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': self.scratch,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Imported Assembly'
        })
        return html_report

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                         import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that the assembly will be stored to
        """
        uuid_string = str(uuid.uuid4())

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}
        object_data = self.dfu.get_objects(get_objects_params)
        objects_created = [{
            'ref': obj_ref,
            'description': 'Imported Assembly'
        }]

        output_html_files = self.generate_html_report(obj_ref, object_data,
                                                      params)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 270,
            'report_object_name': 'kb_upload_assembly_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
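Tying the class together, a plausible import-and-report sequence per the docstrings above (paths and names are illustrative):

# Illustrative only; config must carry SDK_CALLBACK_URL, scratch, KB_AUTH_TOKEN.
params = {
    'staging_file_subdir_path': 'subdir_1/subdir_2/assembly.fasta',
    'assembly_name': 'imported_assembly',
    'workspace_name': 'my_workspace',
}
# importer = ImportAssemblyUtil(config)
# ref = importer.import_fasta_as_assembly_from_staging(params)['obj_ref']
# report = importer.generate_report(ref, params)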
Exemplo n.º 31
0
    def run_idba_ud(self, ctx, params):
        """
        Run IDBA on paired end libraries
        :param params: instance of type "idba_ud_Params" (Input parameters
           for running idba_ud. string workspace_name - the name of the
           workspace from which to take input and store output.
           list<paired_end_lib> read_libraries - Illumina PairedEndLibrary
           files to assemble. string output_contigset_name - the name of the
           output contigset min_contig_length - minimum length of contigs to
           output, default is 2000 @optional kval_args) -> structure:
           parameter "workspace_name" of String, parameter "read_libraries"
           of list of type "paired_end_lib" (The workspace object name of a
           PairedEndLibrary file, whether of the KBaseAssembly or KBaseFile
           type.), parameter "output_contigset_name" of String, parameter
           "min_contig_length" of Long, parameter "kval_args" of type
           "kval_args_type" (Additional parameters: k values for idba_ud.
           (Note: The UI elements for these values have been removed, based
           on feedback)) -> structure: parameter "mink_arg" of Long,
           parameter "maxk_arg" of Long, parameter "step_arg" of Long
        :returns: instance of type "idba_ud_Output" (Output parameters for
           IDBA run. string report_name - the name of the KBaseReport.Report
           workspace object. string report_ref  - the workspace reference of
           the report.) -> structure: parameter "report_name" of String,
           parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_idba_ud

        print("===================  IN run_idba_ud")

        print("PARAMS: ")
        pprint(params)
        print("============================   END OF PARAMS: ")

        # A whole lot of this is adapted or outright copied from
        # https://github.com/msneddon/MEGAHIT
        self.log('Running run_idba_ud with params:\n' + pformat(params))

        token = ctx['token']

        # the reads should really be specified as a list of absolute ws refs
        # but the narrative doesn't do that yet
        self.process_params(params)

        # get absolute refs from ws
        wsname = params[self.PARAM_IN_WS]
        obj_ids = []
        for r in params[self.PARAM_IN_LIB]:
            obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})
        ws = workspaceService(self.workspaceURL, token=token)
        ws_info = ws.get_object_info_new({'objects': obj_ids})
        reads_params = []

        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        readcli = ReadsUtils(self.callbackURL, token=ctx['token'])

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = readcli.download_reads({
                'read_libraries': reads_params,
                'interleaved': 'false',
                'gzipped': None
            })['files']
        except ServerError as se:
            self.log('logging stacktrace from dynamic client error')
            self.log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        self.log('Got reads data from converter:\n' + pformat(reads))

        self.check_reads(reads, reftoname)

        reads_data = []
        for ref in reads:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            print("REF:" + str(ref))
            print("READS REF:" + str(reads[ref]))
            seq_tech = reads[ref]["sequencing_tech"]
            if f['type'] == 'interleaved':
                reads_data.append({
                    'fwd_file': f['fwd'],
                    'type': 'paired',
                    'seq_tech': seq_tech
                })
            elif f['type'] == 'paired':
                reads_data.append({
                    'fwd_file': f['fwd'],
                    'rev_file': f['rev'],
                    'type': 'paired',
                    'seq_tech': seq_tech
                })
            elif f['type'] == 'single':
                reads_data.append({
                    'fwd_file': f['fwd'],
                    'type': 'single',
                    'seq_tech': seq_tech
                })
            else:
                raise ValueError('Something is very wrong with read lib ' +
                                 reads_name)

        # set the output location
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        outdir = os.path.join(self.scratch, 'IDBA_dir' + str(timestamp))

        idba_out = self.exec_idba_ud(reads_data, params, outdir)
        self.log('IDBA output dir: ' + idba_out)

        # parse the output and save back to KBase
        output_contigs = os.path.join(idba_out, 'contig.fa')

        self.log('Uploading FASTA file to Assembly')
        assemblyUtil = AssemblyUtil(self.callbackURL,
                                    token=ctx['token'],
                                    service_ver='dev')
        if params.get('min_contig_length', 0) > 0:
            assemblyUtil.save_assembly_from_fasta({
                'file': {
                    'path': output_contigs
                },
                'workspace_name':
                wsname,
                'assembly_name':
                params[self.PARAM_IN_CS_NAME],
                'min_contig_length':
                params['min_contig_length']
            })
            # load report from the length-filtered contigs
            report_name, report_ref = self.load_report(
                output_contigs + '.filtered.fa', params, wsname)
        else:
            assemblyUtil.save_assembly_from_fasta({
                'file': {
                    'path': output_contigs
                },
                'workspace_name':
                wsname,
                'assembly_name':
                params[self.PARAM_IN_CS_NAME]
            })
            # load report from the unfiltered contigs
            report_name, report_ref = self.load_report(output_contigs, params,
                                                       wsname)

        output = {'report_name': report_name, 'report_ref': report_ref}

        #END run_idba_ud

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_idba_ud return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
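For reference, an idba_ud_Params sketch matching the docstring above (values illustrative; kval_args is optional and its UI elements were removed per the note):

# Illustrative only; key names follow the idba_ud_Params docstring.
params = {
    'workspace_name': 'my_workspace',
    'read_libraries': ['my_pe_library'],      # object names or ws refs
    'output_contigset_name': 'idba.contigs',
    'min_contig_length': 2000,                # @optional, default 2000
    'kval_args': {'mink_arg': 20, 'maxk_arg': 100, 'step_arg': 20},  # @optional
}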
Exemplo n.º 32
0
    def run_megahit(self, ctx, params):
        """
        :param params: instance of type "MegaHitParams" (Run MEGAHIT.  Most
           parameters here are just passed forward to MEGAHIT workspace_name
           - the name of the workspace for input/output read_library_ref -
           the name of the PE read library (SE library support in the future)
           output_contig_set_name - the name of the output contigset
           megahit_parameter_preset - override a group of parameters;
           possible values: meta            '--min-count 2 --k-list
           21,41,61,81,99' (generic metagenomes, default) meta-sensitive 
           '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more
           sensitive but slower) meta-large      '--min-count 2 --k-list
           27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
           bulk            '--min-count 3 --k-list 31,51,71,91,99 --no-mercy'
           (experimental, standard bulk sequencing with >= 30x depth)
           single-cell     '--min-count 3 --k-list 21,33,55,77,99,121
           --merge_level 20,0.96' (experimental, single cell data) min_count
           - minimum multiplicity for filtering (k_min+1)-mers, default 2
           min_k - minimum kmer size (<= 127), must be odd number, default 21
           max_k - maximum kmer size (<= 127), must be odd number, default 99
           k_step - increment of kmer size of each iteration (<= 28), must be
           even number, default 10 k_list - list of kmer size (all must be
           odd, in the range 15-127, increment <= 28); override `--k-min',
           `--k-max' and `--k-step' min_contig_length - minimum length of
           contigs to output, default is 2000 @optional
           megahit_parameter_preset @optional min_count @optional k_min
           @optional k_max @optional k_step @optional k_list @optional
           min_contig_length) -> structure: parameter "workspace_name" of
           String, parameter "read_library_ref" of String, parameter
           "output_contigset_name" of String, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_length" of Long
        :returns: instance of type "MegaHitOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_megahit
        print('Running run_megahit with params=')
        pprint(params)

        # STEP 1: basic parameter checks + parsing
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'read_library_ref' not in params:
            raise ValueError('read_library_ref parameter is required')
        if 'output_contigset_name' not in params:
            raise ValueError('output_contigset_name parameter is required')

        # STEP 2: get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {'read_libraries': [input_ref],
                        'interleaved': 'false',
                        'gzipped': None
                        }
        ru = ReadsUtils(self.callbackURL)
        reads = ru.download_reads(reads_params)['files']

        print('Input reads files:')
        fwd = reads[input_ref]['files']['fwd']
        rev = reads[input_ref]['files']['rev']
        pprint('forward: ' + fwd)
        pprint('reverse: ' + rev)

        # STEP 3: run megahit
        # construct the command
        megahit_cmd = [self.MEGAHIT]

        # we only support PE reads, so add that
        megahit_cmd.append('-1')
        megahit_cmd.append(fwd)
        megahit_cmd.append('-2')
        megahit_cmd.append(rev)

        # if a preset is defined, use that:
        if 'megahit_parameter_preset' in params:
            if params['megahit_parameter_preset']:
                megahit_cmd.append('--presets')
                megahit_cmd.append(params['megahit_parameter_preset'])

        if 'min_count' in params:
            if params['min_count']:
                megahit_cmd.append('--min-count')
                megahit_cmd.append(str(params['min_count']))
        if 'k_min' in params:
            if params['k_min']:
                megahit_cmd.append('--k-min')
                megahit_cmd.append(str(params['k_min']))
        if 'k_max' in params:
            if params['k_max']:
                megahit_cmd.append('--k-max')
                megahit_cmd.append(str(params['k_max']))
        if 'k_step' in params:
            if params['k_step']:
                megahit_cmd.append('--k-step')
                megahit_cmd.append(str(params['k_step']))
        if 'k_list' in params:
            if params['k_list']:
                k_list = []
                for k_val in params['k_list']:
                    k_list.append(str(k_val))
                megahit_cmd.append('--k-list')
                megahit_cmd.append(','.join(k_list))

        min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
        if 'min_contig_length' in params:
            if params['min_contig_length']:
                if str(params['min_contig_length']).isdigit():
                    min_contig_length = params['min_contig_length']
                else:
                    raise ValueError('min_contig_length parameter must be a non-negative integer')

        megahit_cmd.append('--min-contig-len')
        megahit_cmd.append(str(min_contig_length))

        # set the output location
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        megahit_cmd.append('-o')
        megahit_cmd.append(output_dir)

        # run megahit
        print('running megahit:')
        print('    ' + ' '.join(megahit_cmd))
        p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
        retcode = p.wait()

        print('Return code: ' + str(retcode))
        if p.returncode != 0:
            raise ValueError('Error running MEGAHIT, return code: ' +
                             str(retcode) + '\n')

        output_contigs = os.path.join(output_dir, 'final.contigs.fa')

        # on Macs, we cannot run MEGAHIT in the shared host scratch space, so we need to move the file there
        if self.mac_mode:
            shutil.move(output_contigs, os.path.join(self.host_scratch, 'final.contigs.fa'))
            output_contigs = os.path.join(self.host_scratch, 'final.contigs.fa')

        # STEP 4: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL)
        output_data_ref = assemblyUtil.save_assembly_from_fasta({
                                                                'file': {'path': output_contigs},
                                                                'workspace_name': params['workspace_name'],
                                                                'assembly_name': params['output_contigset_name']
                                                                })


        # STEP 5: generate and save the report

        # compute a simple contig length distribution for the report
        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        report = ''
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                                 'label': params['output_contigset_name']}]})
        except QUASTError as qe:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok 
            print('Logging exception from running QUAST')
            print(str(qe))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report(
                {'message': report,
                 'objects_created': [{'ref': output_data_ref, 'description': 'Assembled contigs'}],
                 'direct_html_link_index': 0,
                 'html_links': [{'shock_id': quastret['shock_id'],
                                 'name': 'report.html',
                                 'label': 'QUAST report'}
                                ],
                 'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
                 'workspace_name': params['workspace_name']
                 })
        except _RepError as re:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok 
            print('Logging exception from creating report object')
            print(str(re))
            # TODO delete shock node
            raise

        # STEP 6: construct the output to send back
        output = {'report_name': report_info['name'], 'report_ref': report_info['ref']}

        #END run_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Exemplo n.º 33
0
    def exec_megahit(self, ctx, params):
        """
        :param params: instance of type "ExecMegaHitParams" (exec_megahit()
           Actual execution of MEGAHIT Accepts ReadsSet or a ReadsLibrary as
           Input Creates Assembly object(s) as output. Will eventually also
           create AssemblySet object if input is a ReadsSet and not running a
           combined assembly Other vars same as run_megahit()) -> structure:
           parameter "workspace_name" of String, parameter "input_reads_ref"
           of String, parameter "output_contigset_name" of String, parameter
           "combined_assembly_flag" of Long, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_len" of Long
        :returns: instance of type "ExecMegaHitOutput" -> structure:
           parameter "report_text" of String, parameter
           "output_contigset_ref" of list of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN exec_megahit
        console = []
        self.log(console, 'Running exec_megahit() with params=')
        self.log(console, "\n" + pformat(params))

        #SERVICE_VER = 'dev'  # DEBUG
        SERVICE_VER = 'release'

        ### STEP 0: init
        token = ctx['token']
        wsClient = workspaceService(self.workspaceURL, token=token)
        headers = {'Authorization': 'OAuth ' + token}
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        ### STEP 1: basic parameter checks + parsing
        required_params = [
            'workspace_name', 'input_reads_ref', 'output_contigset_name'
        ]
        for required_param in required_params:
            if required_param not in params or params[required_param] is None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        ### STEP 2: determine if input is a ReadsLibrary or ReadsSet
        input_reads_ref = params['input_reads_ref']
        input_reads_name = None
        try:
            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple

            input_reads_obj_info = wsClient.get_object_info_new(
                {'objects': [{
                    'ref': input_reads_ref
                }]})[0]
            input_reads_obj_type = re.sub(
                r'-[0-9]+\.[0-9]+$', "",
                input_reads_obj_info[TYPE_I])  # remove trailing version
            input_reads_name = input_reads_obj_info[NAME_I]

        except Exception as e:
            raise ValueError('Unable to get reads object from workspace: (' +
                             input_reads_ref + ')' + str(e))

        accepted_input_types = [
            "KBaseSets.ReadsSet", "KBaseFile.PairedEndLibrary"
        ]
        if input_reads_obj_type not in accepted_input_types:
            raise ValueError("Input reads of type '" + input_reads_obj_type +
                             "' not accepted.  Must be one of " +
                             ", ".join(accepted_input_types))

        if input_reads_obj_type == "KBaseSets.ReadsSet":
            required_param = 'combined_assembly_flag'
            if required_param not in params or params[required_param] is None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        ### STEP 3: get the list of library references
        if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
            readsSet_ref_list = [input_reads_ref]
            readsSet_names_list = [input_reads_name]

        elif input_reads_obj_type == "KBaseSets.ReadsSet":
            readsSet_ref_list = []
            readsSet_names_list = []

            try:
                setAPI_Client = SetAPI(
                    url=self.serviceWizardURL,
                    token=ctx['token'])  # for dynamic service
                #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token'])  # SDK local method
            except Exception as e:
                # don't echo the auth token into the error message
                raise ValueError(
                    "SetAPI FAILURE: Unable to get SetAPI Client from serviceWizard: '"
                    + self.serviceWizardURL + "': " + str(e))
                #raise ValueError("SetAPI FAILURE: Unable to get SetAPI Client as local method callbackURL: '"+self.callbackURL+"': " + str(e))

            try:
                input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                    'ref': input_reads_ref,
                    'include_item_info': 1
                })
            except Exception as e:
                raise ValueError(
                    'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                    + str(input_reads_ref) + ")\n" + str(e))

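            # each item is assumed to be shaped like
            #   {'ref': '123/4/5', 'info': <object_info 11-tuple>}
            # so info[NAME_I] yields the member's object name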
            for readsLibrary_obj in input_readsSet_obj['data']['items']:
                readsSet_ref_list.append(readsLibrary_obj['ref'])
                NAME_I = 1
                readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])

        else:
            raise ValueError("Input reads of type '" + input_reads_obj_type +
                             "' not accepted.  Must be one of " +
                             ", ".join(accepted_input_types))

        ### STEP 4: If doing a combined assembly on a ReadsSet, download reads one at a time and combine
        if input_reads_obj_type == "KBaseSets.ReadsSet" and params[
                'combined_assembly_flag'] != 0:

            self.log(
                console,
                "MegaHit_Sets:exec_megahit(): CREATING COMBINED INPUT FASTQ FILES"
            )

            # make dir
            timestamp = int(
                (datetime.utcnow() -
                 datetime.utcfromtimestamp(0)).total_seconds() * 1000)
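            # millisecond timestamp keeps this run's scratch input dir unique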
            input_dir = os.path.join(self.scratch, 'input.' + str(timestamp))
            if self.mac_mode:  # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
                input_dir = os.path.join(self.host_scratch,
                                         'input.' + str(timestamp))
            if not os.path.exists(input_dir):
                os.makedirs(input_dir)

            # connect to ReadsUtils Client
            try:
                readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                               token=ctx['token'])  # SDK local
            except Exception as e:
                raise ValueError("Unable to get readsUtils_Client\n" + str(e))

            # start combined file
            read_buf_size = 65536
            write_buf_size = 65536
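            # buffered handles stream the FASTQ data in 64 KiB chunks so the
            # (potentially huge) read files are never held in memory at once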
            combined_input_fwd_path = os.path.join(input_dir,
                                                   'input_reads_fwd.fastq')
            combined_input_rev_path = os.path.join(input_dir,
                                                   'input_reads_rev.fastq')
            combined_input_fwd_handle = open(combined_input_fwd_path, 'w',
                                             write_buf_size)
            combined_input_rev_handle = open(combined_input_rev_path, 'w',
                                             write_buf_size)

            # add libraries, one at a time
            for this_input_reads_ref in readsSet_ref_list:
                self.log(
                    console,
                    "MegaHit_Sets:exec_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: "
                    + str(this_input_reads_ref))
                try:
                    readsLibrary = readsUtils_Client.download_reads({
                        'read_libraries': [this_input_reads_ref],
                        'interleaved': 'false'
                    })
                except Exception as e:
                    raise ValueError(
                        'Unable to get reads object from workspace: (' +
                        this_input_reads_ref + ")\n" + str(e))

                this_input_fwd_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['fwd']
                this_input_rev_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['rev']
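                # assumed shape of the download_reads() result (illustrative):
                #   {'files': {<reads_ref>: {'files': {'fwd': '/path/fwd.fq',
                #                                      'rev': '/path/rev.fq'}}}}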

                # append fwd
                self.log(
                    console,
                    "MegaHit_Sets:exec_megahit(): APPENDING FASTQ FILES FOR ReadsSet member: "
                    + str(this_input_reads_ref))
                this_input_path = this_input_fwd_path
                cat_file_handle = combined_input_fwd_handle
                with open(this_input_path, 'r',
                          read_buf_size) as this_input_handle:
                    while True:
                        read_data = this_input_handle.read(read_buf_size)
                        if read_data:
                            cat_file_handle.write(read_data)
                        else:
                            break
                os.remove(this_input_path)  # free space; piece file no longer needed

                # append rev
                this_input_path = this_input_rev_path
                cat_file_handle = combined_input_rev_handle
                with open(this_input_path, 'r',
                          read_buf_size) as this_input_handle:
                    while True:
                        read_data = this_input_handle.read(read_buf_size)
                        if read_data:
                            cat_file_handle.write(read_data)
                        else:
                            break
                os.remove(this_input_path)  # free space; piece file no longer needed

            combined_input_fwd_handle.close()
            combined_input_rev_handle.close()

        ### STEP 5: finally run MegaHit_Sets
        exec_megahit_single_library_params = params
        output_assemblyset_contigset_paths = []
        output_contigset_path = None
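        # Three cases follow: (1) a single PairedEndLibrary is assembled as-is;
        # (2) a ReadsSet with combined_assembly_flag set is assembled once from
        # the concatenated FASTQs built in STEP 4; (3) an uncombined ReadsSet
        # gets one assembly per member library.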

        # PairedEndLibrary
        if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
            self.log(
                console,
                "MegaHit_Sets:exec_megahit(): DOWNLOADING FASTQ FILES FOR ReadsLibrary: "
                + str(input_reads_ref))
            try:
                readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                               token=ctx['token'])  # SDK local
                readsLibrary = readsUtils_Client.download_reads({
                    'read_libraries': [input_reads_ref],
                    'interleaved': 'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get reads object from workspace: (' +
                    input_reads_ref + ")\n" + str(e))

            input_fwd_path = readsLibrary['files'][input_reads_ref]['files'][
                'fwd']
            input_rev_path = readsLibrary['files'][input_reads_ref]['files'][
                'rev']
            exec_megahit_single_library_params[
                'input_fwd_path'] = input_fwd_path
            exec_megahit_single_library_params[
                'input_rev_path'] = input_rev_path

            # run MEGAHIT on this single library
            output_contigset_path = self.exec_megahit_single_library(
                exec_megahit_single_library_params)
            output_assemblyset_contigset_paths.append(output_contigset_path)

            os.remove(input_fwd_path)  # files can be really big
            os.remove(input_rev_path)

        # ReadsSet combined (already downloaded and combined fastqs)
        elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[
                'combined_assembly_flag'] != 0:

            input_fwd_path = combined_input_fwd_path
            input_rev_path = combined_input_rev_path
            exec_megahit_single_library_params[
                'input_fwd_path'] = input_fwd_path
            exec_megahit_single_library_params[
                'input_rev_path'] = input_rev_path

            # run MEGAHIT on the combined reads
            output_contigset_path = self.exec_megahit_single_library(
                exec_megahit_single_library_params)
            output_assemblyset_contigset_paths.append(output_contigset_path)

            os.remove(input_fwd_path)  # files can be really big
            os.remove(input_rev_path)

        # ReadsSet uncombined (still have to download)
        elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[
                'combined_assembly_flag'] == 0:
            # connect to ReadsUtils Client
            try:
                readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                               token=ctx['token'])  # SDK local
            except Exception as e:
                raise ValueError("Unable to get readsUtils_Client\n" + str(e))

            # get libraries, one at a time, and run MegaHit_Sets
            output_assemblyset_contigset_paths = []
            for this_input_reads_ref in readsSet_ref_list:
                self.log(
                    console,
                    "MegaHit_Sets:exec_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: "
                    + str(this_input_reads_ref))
                try:
                    readsLibrary = readsUtils_Client.download_reads({
                        'read_libraries': [this_input_reads_ref],
                        'interleaved': 'false'
                    })
                except Exception as e:
                    raise ValueError(
                        'Unable to get reads object from workspace: (' +
                        this_input_reads_ref + ")\n" + str(e))

                this_input_fwd_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['fwd']
                this_input_rev_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['rev']
                exec_megahit_single_library_params[
                    'input_fwd_path'] = this_input_fwd_path
                exec_megahit_single_library_params[
                    'input_rev_path'] = this_input_rev_path

                # run MEGAHIT on this member library
                this_output_contigset_path = self.exec_megahit_single_library(
                    exec_megahit_single_library_params)
                output_assemblyset_contigset_paths.append(
                    this_output_contigset_path)

                os.remove(this_input_fwd_path)  # files can be really big
                os.remove(this_input_rev_path)

        # just in case we've confused ourselves
        else:
            raise ValueError("error in logic: unhandled input type '" +
                             input_reads_obj_type + "'")

        ### STEP 6: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL,
                                    token=ctx['token'],
                                    service_ver=SERVICE_VER)
        output_contigset_refs = []
        output_contigset_names = []
        for i, this_output_contigset_path in enumerate(
                output_assemblyset_contigset_paths):
            if len(output_assemblyset_contigset_paths) == 1:
                assembly_name = params['output_contigset_name']
            else:
                assembly_name = readsSet_names_list[i] + '-' + params[
                    'output_contigset_name']

            this_output_data_ref = assemblyUtil.save_assembly_from_fasta({
                'file': {'path': this_output_contigset_path},
                'workspace_name': params['workspace_name'],
                'assembly_name': assembly_name
            })
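            # save_assembly_from_fasta() is assumed to return the saved
            # object's workspace reference, e.g. '123/4/5'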

            output_contigset_refs.append(this_output_data_ref)
            output_contigset_names.append(assembly_name)

        ### STEP 7: generate the report text

        # compute a simple contig length distribution for the report
        report = ''
        for i, this_output_contigset_path in enumerate(
                output_assemblyset_contigset_paths):

            report += "MegaHit_Sets run for Read Library: " + readsSet_names_list[
                i] + "\n"
            report += "-------------------------------------------------------------\n"
            report += "\n"
            lengths = []
            for seq_record in SeqIO.parse(this_output_contigset_path, 'fasta'):
                lengths.append(len(seq_record.seq))

            # report once per contigset, after all lengths are collected
            report += 'ContigSet saved to: ' + params[
                'workspace_name'] + '/' + output_contigset_names[i] + '\n'
            report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
            report += 'Avg Length: ' + str(
                sum(lengths) / float(len(lengths))) + ' bp.\n'

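            # np.histogram returns `bins` counts and bins+1 edges, so row c
            # counts contigs with length in [edges[c], edges[c+1])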
            bins = 10
            counts, edges = np.histogram(lengths, bins)
            report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
            for c in range(bins):
                report += '   ' + str(counts[c]) + '\t--\t' + str(
                    edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        ### STEP 8: construct the output to send back
        output = {
            'report_text': report,
            'output_contigset_refs': output_contigset_refs
        }

        #END exec_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method exec_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]