Example #1
    def __init__(self, config):
        """

        :param config:
        :param logger:
        :param directory: Working directory
        :param urls: Service urls
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR
        pass
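For context, here is a minimal sketch of the kind of config dict this constructor expects. The key names are taken from the constructor above; the URLs, token, and path are placeholders, and the class name in the commented call is assumed for illustration only.

# Hypothetical config for illustration; the keys mirror the constructor above.
config = {
    "workspace-url": "https://<host>/services/ws",
    "SDK_CALLBACK_URL": "http://<callback-host>:<port>",
    "srv-wiz-url": "https://<host>/services/service_wizard",
    "KB_AUTH_TOKEN": "<auth token>",
    "shock-url": "https://<host>/services/shock-api",
    "scratch": "/kb/module/work/tmp",
}
# The constructor also reads os.environ['VERSION'] for the tool version.
# util = CufflinksUtil(config)  # class name assumed; it is not shown in the snippet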
Example #2
    def _stage_assembly_files(self, object_list):
        """
        _stage_assembly_files: download the fasta files to the scratch area
        return list of file names
        """
        log('Processing assembly object list: {}'.format(object_list))

        # Sourmash uses the sequence filename (including the full path) as the default
        # label for the signatures, so the sequence file name should stay as close to the
        # desired label as possible. That is why each file is not placed under a 'fasta'
        # directory and the '.fa' file extension is not included.

        auc = AssemblyUtil(self.callbackURL)
        staged_file_list = []

        for assembly_upa in object_list:
            try:
                file_ = auc.get_assembly_as_fasta({'ref': assembly_upa})['path']
            except AssemblyUtilError as assembly_error:
                print(str(assembly_error))
                raise
            filename = os.path.basename(file_).replace('.fa', '')
            to_upper_command = "awk '{ if ($0 !~ />/) {print toupper($0)} else {print $0} }' " \
                               + file_ + ' > tmp.fa ' + '&& mv tmp.fa ' + filename
            self._run_command(to_upper_command)
            staged_file_list.append(filename)

        log('Created file list: {}'.format(staged_file_list))
        return staged_file_list
Example #3
 def test_annotate_contigs(self):
     assembly_file_name = "small.fna"  #"AP009048.fna"
     assembly_test_file = os.path.join("/kb/module/test/data", assembly_file_name)
     assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name)
     shutil.copy(assembly_test_file, assembly_temp_file)
     assembly_name = 'Assembly.1'
     au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token'])
     assembly_ref = au.save_assembly_from_fasta({'file': {'path': assembly_temp_file}, 
                                                 'workspace_name': self.getWsName(),
                                                 'assembly_name': assembly_name})
     genome_name = "Genome.1"
     result = self.getImpl().annotate_contigs(self.getContext(),
                                              {'assembly_ref': assembly_ref,
                                               'output_workspace': self.getWsName(),
                                               'output_genome_name': genome_name,
                                               'evalue': None,
                                               'fast': 0,
                                               'gcode': None,
                                               'genus': '',
                                               'kingdom': 'Bacteria',
                                               'metagenome': 0,
                                               'mincontiglen': 1,
                                               'norrna': 0,
                                               'notrna': 0,
                                               'rawproduct': 0,
                                               'rfam': 1,
                                               'scientific_name': 'Super : diper - name;'
                                               })[0]
     rep = self.getWsClient().get_objects([{'ref': result['report_ref']}])[0]['data']
     self.assertTrue('text_message' in rep)
     print("Report:\n" + str(rep['text_message']))
Example #4
    def getBogusAssembly(self):
        # Create a fake assembly with lots of contigs

        assembly_file_name = "bogus.fna"  # "AP009048.fna"
        assembly_temp_file = os.path.join("/kb/module/work/tmp",
                                          assembly_file_name)
        with open(assembly_temp_file, "w") as f:
            for i in range(1, 30002):
                f.write("> contig_%d\n" % i)
                f.write(
                    "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC\n"
                )

        assembly_name = "Assembly.2"
        au = AssemblyUtil(os.environ["SDK_CALLBACK_URL"],
                          token=self.getContext()["token"])
        assembly_ref = au.save_assembly_from_fasta({
            "file": {"path": assembly_temp_file},
            "workspace_name": self.getWsName(),
            "assembly_name": assembly_name
        })
        self.assembly_ref = assembly_ref
        print("Uploaded bogus assembly " + str(assembly_ref))
        return assembly_ref
Example #5
 def __init__(self, config):
     #BEGIN_CONSTRUCTOR
     self.scratch = config['scratch']
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.dfu = AssemblyUtil(self.callback_url)
     #END_CONSTRUCTOR
     pass
Example #6
    def load_test_genome_direct(self, filename, assembly_filename, obj_name):
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        assembly_ref = au.save_assembly_from_fasta({
            'workspace_name': self.getWsName(),
            'assembly_name': obj_name + '.assembly',
            'file': {'path': assembly_filename}
        })
        pprint('created test assembly: ' + assembly_ref)

        with open(filename, 'r') as file:
            data_str = file.read()
        data = json.loads(data_str)
        data['assembly_ref'] = assembly_ref
        # save to ws
        save_info = {
            'workspace': self.getWsName(),
            'objects': [{
                'type': 'KBaseGenomes.Genome',
                'data': data,
                'name': obj_name + '.genome'
            }]
        }
        result = self.ws.save_objects(save_info)
        info = result[0]
        ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        print('created test genome: ' + ref + ' from file ' + filename)
        return ref
Example #7
    def _build_index(self, assembly_info, validated_params):
        # get the assembly as a fasta file using AssemblyUtil
        au = AssemblyUtil(self.callback_url)
        fasta_info = au.get_assembly_as_fasta({'ref': assembly_info['ref']})

        # make the target destination folder (check again it wasn't created yet)
        if os.path.exists(validated_params['output_dir']):
            raise ValueError('Output directory name specified (' + validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')
        os.makedirs(validated_params['output_dir'])

        # configure the command line args and run it
        cli_params = self._build_cli_params(fasta_info['path'], fasta_info['assembly_name'], validated_params)
        self.bowtie2.run('bowtie2-build', cli_params)
        index_info = {'output_dir': validated_params['output_dir'],
                      'index_files_basename': fasta_info['assembly_name']}

        # cache the result, mark if it worked or not
        cache_success = self._put_cached_index(assembly_info,
                                               fasta_info['assembly_name'],
                                               validated_params['output_dir'],
                                               validated_params['ws_for_cache'])
        if cache_success:
            index_info['pushed_to_cache'] = 1
        else:
            index_info['pushed_to_cache'] = 0

        return index_info
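As a rough guide, here is a hedged sketch of the validated_params dict this helper reads, inferred only from the keys used above ('output_dir' and 'ws_for_cache'); anything consumed by _build_cli_params is not visible in the snippet and is omitted.

# Keys inferred from _build_index above; values are placeholders.
validated_params = {
    'output_dir': '/kb/module/work/tmp/bowtie2_index_example',  # must not exist yet
    'ws_for_cache': 'my_test_workspace'                         # workspace that receives the cached index
}
# index_info = self._build_index(assembly_info, validated_params)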
Example #8
    def get_fasta_file(self, genome_ref):
        ws = Workspace(self.ws_url)
        # test if genome references an assembly type
        # do get_objects2 without data. get list of refs
        genome_obj_info = ws.get_objects2({
            'objects': [{
                'ref': genome_ref
            }],
            'no_data': 1
        })
        # get the list of genome refs from the returned info.
        # if there are no refs (or something funky with the return), this will be an empty list.
        # this WILL fail if data is an empty list. But it shouldn't be, and we know because
        # we have a real genome reference, or get_objects2 would fail.
        genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

        # see which of those are of an appropriate type (ContigSet or Assembly), if any.
        assembly_ref = list()
        ref_params = [{'ref': x} for x in genome_obj_refs]
        ref_info = ws.get_object_info3({'objects': ref_params})
        for idx, info in enumerate(ref_info.get('infos')):
            if "KBaseGenomeAnnotations.Assembly" in info[
                    2] or "KBaseGenomes.ContigSet" in info[2]:
                assembly_ref.append(";".join(ref_info.get('paths')[idx]))
        # now just get the file.
        au = AssemblyUtil(self.callback_url)
        fasta_file = au.get_assembly_as_fasta({'ref': assembly_ref[0]})
        return fasta_file["path"]
Example #9
 def test_annotate_contigs(self):
     assembly_file_name = "small.fna"  #"AP009048.fna"
     assembly_test_file = os.path.join("/kb/module/test/data", assembly_file_name)
     assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name)
     shutil.copy(assembly_test_file, assembly_temp_file)
     assembly_name = 'Assembly.1'
     au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
     assembly_ref = au.save_assembly_from_fasta({'file': {'path': assembly_temp_file}, 
                                                 'workspace_name': self.getWsName(),
                                                 'assembly_name': assembly_name})
     # Add a genome to the WS to test ref_paths
     genome_name = "Genome.1"
     genome = {'id': 'Unknown', 'features': [],
               'scientific_name': "",
               'domain': "", 'genetic_code': 0,
               'assembly_ref': assembly_ref,
               'cdss': [], 'mrnas': [],
               'source': 'Magic!',
               'gc_content': 0, 'dna_size': 0,
               'reference_annotation': 0}
     prov = self.getContext().provenance()
     ga = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'])
     info = ga.save_one_genome_v1(
         {'workspace': self.getWsName(), 'name': genome_name,
          'data': genome, 'provenance': prov})['info']
     genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
     result = self.getImpl().annotate_contigs(self.getContext(),
                                              {'assembly_ref': "{};{}".format(genome_ref, assembly_ref),
                                               'output_workspace': self.getWsName(),
                                               'output_genome_name': genome_name,
                                               'evalue': None,
                                               'fast': 0,
                                               'gcode': 0,
                                               'genus': 'genus',
                                               'kingdom': 'Bacteria',
                                               'metagenome': 0,
                                               'mincontiglen': 1,
                                               'norrna': 0,
                                               'notrna': 0,
                                               'rawproduct': 0,
                                               'rfam': 1,
                                               'scientific_name': 'Super : diper - name;'
                                               })[0]
     rep = self.getWsClient().get_objects([{'ref': result['report_ref']}])[0]['data']
     self.assertTrue('text_message' in rep)
     print("Report:\n" + str(rep['text_message']))
     genome_ref = self.getWsName() + "/" + genome_name
     genome = self.getWsClient().get_objects([{'ref': genome_ref}])[0]['data']
     features_to_work = {}
     for feature in genome['features']:
         features_to_work[feature['id']] = feature['location']
     aseq = AssemblySequenceAPI(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token'])
     dna_sequences = aseq.get_dna_sequences({'requested_features': features_to_work, 
                                             'assembly_ref': genome['assembly_ref']})['dna_sequences']
     bad_dnas = 0
     for feature in genome['features']:
         if feature['dna_sequence'] != dna_sequences[feature['id']]:
             bad_dnas += 1
     self.assertEqual(bad_dnas, 0)
Example #10
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.shock_url = config['shock-url']
     self.dfu = DataFileUtil(self.callback_url)
     self.au = AssemblyUtil(self.callback_url)
     self.setapi = SetAPI(self.callback_url)
     self.wss = workspaceService(config['workspace-url'])
Example #11
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = os.path.join(config['scratch'],
                                 'import_assembly_' + str(uuid.uuid4()))
     handler_utils._mkdir_p(self.scratch)
     self.token = config['KB_AUTH_TOKEN']
     self.dfu = DataFileUtil(self.callback_url)
     self.au = AssemblyUtil(self.callback_url)
     self.uploader_utils = UploaderUtil(config)
Example #12
 def load_fasta_file(self, filename, obj_name, contents):
     f = open(filename, 'w')
     f.write(contents)
     f.close()
     assemblyUtil = AssemblyUtil(self.callback_url)
     assembly_ref = assemblyUtil.save_assembly_from_fasta({'file': {'path': filename},
                                                           'workspace_name': self.getWsName(),
                                                           'assembly_name': obj_name
                                                           })
     return assembly_ref
Example #13
    def run_mash_sketch(self, ctx, params):
        """
        Generate a sketch file from a fasta/fastq file
        :param params: instance of type "MashSketchParams" (* * Pass in **one
           of** input_path, assembly_ref, or reads_ref *   input_path -
           string - local file path to an input fasta/fastq *   assembly_ref
           - string - workspace reference to an Assembly type *   reads_ref -
           string - workspace reference to a Reads type * Optionally, pass in
           a boolean indicating whether you are using paired-end reads. *
           paired_ends - boolean - whether you are passing in paired ends) ->
           structure: parameter "input_path" of String, parameter
           "assembly_ref" of String, parameter "reads_ref" of String,
           parameter "paired_ends" of type "boolean" (params:
           input_upa: workspace reference to an assembly object
           workspace_name: name of current workspace search_db: database to
           search n_max_results: number of results to return, integer between
           1 and 100)
        :returns: instance of type "MashSketchResults" (* * Returns the local
           scratch file path of the generated sketch file. * Will have the
           extension '.msh') -> structure: parameter "sketch_path" of String
        """
        # ctx is the context object
        # return variables are: results
        #BEGIN run_mash_sketch
        if 'reads_ref' in params:
            reads_utils = ReadsUtils(self.callbackURL)
            result = reads_utils.download_reads({
                'read_libraries': [params['reads_ref']],
                'interleaved': 'true'
            })
            input_path = result['files'][params['reads_ref']]['files']['fwd']
        elif 'assembly_ref' in params:
            assembly_util = AssemblyUtil(self.callbackURL)
            result = assembly_util.get_assembly_as_fasta(
                {'ref': params['assembly_ref']})
            input_path = result['path']
        elif 'input_path' in params:
            input_path = params['input_path']
        else:
            raise ValueError(
                'Invalid params; must provide one of `reads_ref`, `assembly_ref`, or `input_path`.'
            )
        mash_utils = MashUtils(self.config, self.auth_token)
        output_file_path = mash_utils.mash_sketch(
            input_path, paired_ends=params.get('paired_ends'))
        results = {'sketch_path': output_file_path}
        #END run_mash_sketch

        # At some point might do deeper type checking...
        if not isinstance(results, dict):
            raise ValueError('Method run_mash_sketch return value ' +
                             'results is not type dict as required.')
        # return the results
        return [results]
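Based on the docstring and the branching above, a hedged sketch of the three mutually exclusive ways to build the params dict; the object references and file path are placeholders.

# Supply exactly one of these, per the parameter checks above.
params_from_assembly = {'assembly_ref': '12345/6/7'}                 # Assembly object ref (placeholder)
params_from_reads = {'reads_ref': '12345/8/1', 'paired_ends': 1}     # Reads object ref (placeholder)
params_from_file = {'input_path': '/kb/module/work/tmp/contigs.fa'}  # local fasta/fastq path
# results = impl.run_mash_sketch(ctx, params_from_assembly)[0]       # -> {'sketch_path': '<...>.msh'}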
Example #14
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('SetAPI'):
            cls.cfg[nameval[0]] = nameval[1]
        authServiceUrl = cls.cfg.get('auth-service-url',
                "https://kbase.us/services/authorization/Sessions/Login")
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'SetAPI',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = SetAPI(cls.cfg)

        # setup data at the class level for now (so that the code is run
        # once for all tests, not before each test case.  Not sure how to
        # do that outside this function..)
        suffix = int(time.time() * 1000)
        wsName = "test_SetAPI_" + str(suffix)
        ret = cls.wsClient.create_workspace({'workspace': wsName})
#        wsName = 'pranjan77:1477441032423'
        cls.wsName = wsName
        # copy test file to scratch area
        fna_filename = "seq.fna"
        fna_path = os.path.join(cls.cfg['scratch'], fna_filename)
        shutil.copy(os.path.join("data", fna_filename), fna_path)

        ru = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        ws_obj_name = 'MyNewAssembly'
        cls.assembly1ref = ru.save_assembly_from_fasta( 
            {
                'file':{'path':fna_path},
                'workspace_name':wsName,
                'assembly_name':'assembly_obj_1'
            })
        cls.assembly2ref = ru.save_assembly_from_fasta( 
            {
                'file':{'path':fna_path},
                'workspace_name':wsName,
                'assembly_name':'assembly_obj_2'
            })
Example #15
 def load_fasta_file(self, path, name):
     assembly_util = AssemblyUtil(self.callback_url)
     return assembly_util.save_assembly_from_fasta({
         'file': {'path': path},
         'workspace_name': self.getWsName(),
         'assembly_name': name
     })
Example #16
 def get_fasta_file(self, filename, obj_name):
     assemblyUtil = AssemblyUtil(self.callback_url)
     assembly_ref = assemblyUtil.save_assembly_from_fasta({
         'file': {'path': filename},
         'workspace_name': self.getWsName(),
         'assembly_name': obj_name
     })
     return assembly_ref
Example #17
 def loadAssembly(self):
     if hasattr(self.__class__, 'assembly_ref'):
         return self.__class__.assembly_ref
     fasta_path = os.path.join(self.scratch, 'test.fna')
     shutil.copy(os.path.join('data', 'test.fna'), fasta_path)
     au = AssemblyUtil(self.callback_url)
     assembly_ref = au.save_assembly_from_fasta({'file': {'path': fasta_path},
                                                 'workspace_name': self.getWsName(),
                                                 'assembly_name': 'test_assembly'
                                                 })
     self.__class__.assembly_ref = assembly_ref
     return assembly_ref
Example #18
 def save_assembly(self, wsname, output_contigs, token, name, console):
     self.log(console, 'Uploading FASTA file to Assembly')
     assemblyUtil = AssemblyUtil(self.callbackURL,
                                 token=token,
                                 service_ver='dev')
     assemblyUtil.save_assembly_from_fasta({
         'file': {
             'path': output_contigs
         },
         'workspace_name': wsname,
         'assembly_name': name
     })
Example #19
 def get_assembly(self, target_dir, assembly_upa):
     auc = AssemblyUtil(self.callbackURL)
     filename = os.path.join(target_dir, assembly_upa.replace('/', '_'))
     try:
         auc.get_assembly_as_fasta({
             'ref': assembly_upa,
             'filename': filename
         })
     except AssemblyUtilError as assembly_error:
         print(str(assembly_error))
         raise
     return filename
Example #20
    def test_filter_contigs_by_length_01(self):
        method = 'filter_contigs_by_length_01'

        print("\n\nRUNNING: test_filter_contigs_by_length_01()")
        print("===========================================\n\n")

        # upload test data
        try:
            auClient = AssemblyUtil(self.callback_url,
                                    token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))
        ass_file_1 = 'assembly_1.fa'
        ass_file_2 = 'assembly_2.fa'
        ass_path_1 = os.path.join(self.scratch, ass_file_1)
        ass_path_2 = os.path.join(self.scratch, ass_file_2)
        shutil.copy(os.path.join("data", ass_file_1), ass_path_1)
        shutil.copy(os.path.join("data", ass_file_2), ass_path_2)
        ass_ref_1 = auClient.save_assembly_from_fasta({
            'file': {'path': ass_path_1},
            'workspace_name': self.getWsName(),
            'assembly_name': 'assembly_1'
        })
        ass_ref_2 = auClient.save_assembly_from_fasta({
            'file': {'path': ass_path_2},
            'workspace_name': self.getWsName(),
            'assembly_name': 'assembly_2'
        })

        # run method
        input_refs = [ass_ref_1, ass_ref_2]
        base_output_name = method + '_output'
        params = {
            'workspace_name': self.getWsName(),
            'input_assembly_refs': input_refs,
            'min_contig_length': 1000,
            'output_name': 'test_filtered'
        }
        result = self.getImpl().run_filter_contigs_by_length(
            self.getContext(), params)
        print('RESULT:')
        pprint(result)
        pass
Example #21
def loadFasta2Assembly(self, filename):
    fn, ext = os.path.splitext(filename)
    fasta_path = os.path.join(self.scratch, filename)
    shutil.copy(os.path.join('../testReads', filename), fasta_path)
    au = AssemblyUtil(self.callback_url)
    a_ref = au.save_assembly_from_fasta({
        'file': {
            'path': fasta_path
        },
        'workspace_name': self.getWsName(),
        'assembly_name': fn
    })
    return a_ref
Example #22
def fetch_fasta_from_assembly(assembly_ref, ws_url, callback_url):
    """
    From an assembly or contigset, this uses a data file util to build a FASTA file and return the
    path to it.
    """
    allowed_types = ['KBaseFile.Assembly',
                     'KBaseGenomeAnnotations.Assembly',
                     'KBaseGenomes.ContigSet']
    if not check_ref_type(assembly_ref, allowed_types, ws_url):
        raise ValueError("The reference {} cannot be used to fetch a FASTA file".format(
            assembly_ref))
    au = AssemblyUtil(callback_url)
    return au.get_assembly_as_fasta({'ref': assembly_ref})
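A minimal usage sketch for the helper above, assuming it runs inside a KBase SDK job where SDK_CALLBACK_URL is set; the assembly reference and workspace service URL are placeholders.

import os

ws_url = 'https://<host>/services/ws'          # placeholder workspace service URL
callback_url = os.environ['SDK_CALLBACK_URL']
fasta = fetch_fasta_from_assembly('12345/6/7', ws_url, callback_url)  # placeholder ref
print(fasta['path'])  # get_assembly_as_fasta returns a dict with a 'path' key (see the other examples)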
Example #23
    def __init__(self, config):
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)

        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None
Example #24
def get_fasta_from_genome(logger, ws_client, urls, genome_id):

    ref = ws_client.get_object_subset(
        [{'ref': genome_id, 'included': ['contigset_ref']}])
    contig_id = ref[0]['data']['contigset_ref']
    logger.info("Generating FASTA from Genome")
    try:
        # get the FASTA
        assembly = AssemblyUtil(urls['callback_url'])
        ret = assembly.get_assembly_as_fasta({'ref': contig_id})
        output_file = ret['path']
        fasta_file = os.path.basename(output_file)
        return fasta_file
    except Exception as e:
        raise Exception("Unable to create FASTA file from Genome {0}: {1}".format(genome_id, e))
Example #25
 def load_fasta_file(self, filename, obj_name, contents):
     f = open(filename, 'w')
     # TODO make this use the data folder (not sure of relative path)
     f.write(contents)
     f.close()
     assemblyUtil = AssemblyUtil(self.callback_url)
     # TODO why does this next line take forever
     assembly_ref = assemblyUtil.save_assembly_from_fasta({
         'file': {'path': filename},
         'workspace_name': self.getWsName(),
         'assembly_name': obj_name
     })
     return assembly_ref
Example #26
 def get_genome_ref(self, ws_name, tf='ecoliMG1655.fa'):
     if hasattr(self.__class__, 'genomeInfo'):
         return self.__class__.genomeInfo
     au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
     target = os.path.join(self.scratch, tf)
     self.genome_path = target
     shutil.copy('data/' + tf, target)
     self.__class__.genomeInfo = au.save_assembly_from_fasta({
         'file': {'path': target},
         'workspace_name': ws_name,
         'assembly_name': tf.split('.fa')[0]
     })
     return self.__class__.genomeInfo
Example #27
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'provenance': [
                            {'service': 'AssemblyAPI',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('AssemblyAPI'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = AssemblyAPI(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_maxbin_" + str(suffix)
        cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})

        cls.obj_name = "7989/489/2"
        cls.contigs = [u'NZ_ALQT01000016']

        # create an example Assembly
        cls.au = AssemblyUtil(cls.callback_url)
        assembly_filename = 'test.fa'
        cls.assembly_fasta_file_path = os.path.join(cls.scratch, assembly_filename)
        shutil.copy(os.path.join("data", assembly_filename), cls.assembly_fasta_file_path)

        assembly_params = {
            'file': {'path': cls.assembly_fasta_file_path},
            'workspace_name': cls.wsName,
            'assembly_name': 'MyAssembly'
        }
        cls.assembly_ref_1 = cls.au.save_assembly_from_fasta(assembly_params)
        print('Assembly1:' + cls.assembly_ref_1)

        # create a test legacy contigset
        with open('data/contigset1.json') as file:
            contigset_data = json.load(file)
        pprint(contigset_data)
        saveData = {
            'type': 'KBaseGenomes.ContigSet',
            'data': contigset_data,
            'name': 'contigset'
        }
        cls.contig_set_info = cls.wsClient.save_objects({'workspace': cls.wsName, 'objects': [saveData]})[0]
        pprint(cls.contig_set_info)
        cls.contig_set_ref = str(cls.contig_set_info[6]) + '/' + str(cls.contig_set_info[0]) + '/' + str(cls.contig_set_info[4])
        print('ContigSet1:' + cls.contig_set_ref)
Example #28
    def test_genbank_to_genome(self, download_staging_file,
                               update_staging_service):

        fasta_file = 'small_fasta.fna'
        ws_obj_name = 'MyAssembly'

        params = {
            'staging_file_subdir_path': fasta_file,
            'workspace_name': self.getWsName(),
            'assembly_name': ws_obj_name
        }

        ref = self.getImpl().import_fasta_as_assembly_from_staging(
            self.getContext(), params)
        self.assertTrue('obj_ref' in ref[0])
        self.assertTrue('report_ref' in ref[0])
        self.assertTrue('report_name' in ref[0])

        fasta_file_path = os.path.join('/kb/module/work/tmp', fasta_file)
        assemblyUtil = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        fasta_assembly = assemblyUtil.get_assembly_as_fasta(
            {'ref': self.getWsName() + "/{}".format(ws_obj_name)})

        expected_data = None
        with open(fasta_file_path, 'r') as f:
            expected_data = f.read()
        actual_data = None
        with open(fasta_assembly['path'], 'r') as f:
            actual_data = f.read()
        self.assertEqual(actual_data, expected_data)

        get_objects_params = {
            'object_refs': [ref[0].get('obj_ref')],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)
        base_count = object_data.get('data')[0].get('data').get('base_counts')
        dna_size = object_data.get('data')[0].get('data').get('dna_size')

        self.assertEqual(dna_size, 2520)

        expected_base_count = {'A': 700, 'C': 558, 'T': 671, 'G': 591}
        self.assertDictContainsSubset(base_count, expected_base_count)
        self.assertDictContainsSubset(expected_base_count, base_count)
Example #29
def load_fasta_file(callback_url, ws_name, filename, obj_name, contents):
    """
    Loads the given FASTA file into a workspace as an Assembly object.
    """
    f = open(filename, 'w')
    f.write(contents)
    f.close()
    assembly_util = AssemblyUtil(callback_url)
    assembly_ref = assembly_util.save_assembly_from_fasta({
        'file': {'path': filename},
        'workspace_name': ws_name,
        'assembly_name': obj_name
    })
    return assembly_ref
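A minimal usage sketch for the helper above; the workspace name, file path, and FASTA contents are placeholders.

import os

contents = ">contig_1\nACGTACGTACGTACGT\n"     # placeholder sequence
ref = load_fasta_file(os.environ['SDK_CALLBACK_URL'], 'my_test_workspace',
                      '/kb/module/work/tmp/tiny.fa', 'TinyAssembly', contents)
print('Saved assembly ref: ' + str(ref))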
Example #30
 def test_annotate_contigs(self):
     assembly_file_name = "small.fna"  #"AP009048.fna"
     assembly_test_file = os.path.join("/kb/module/test/data", assembly_file_name)
     assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name)
     shutil.copy(assembly_test_file, assembly_temp_file)
     assembly_name = 'Assembly.1'
     au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token'])
     assembly_ref = au.save_assembly_from_fasta({'file': {'path': assembly_temp_file}, 
                                                 'workspace_name': self.getWsName(),
                                                 'assembly_name': assembly_name})
     genome_name = "Genome.1"
     result = self.getImpl().annotate_contigs(self.getContext(),
                                              {'assembly_ref': assembly_ref,
                                               'output_workspace': self.getWsName(),
                                               'output_genome_name': genome_name,
                                               'evalue': None,
                                               'fast': 0,
                                               'gcode': 0,
                                               'genus': 'genus',
                                               'kingdom': 'Bacteria',
                                               'metagenome': 0,
                                               'mincontiglen': 1,
                                               'norrna': 0,
                                               'notrna': 0,
                                               'rawproduct': 0,
                                               'rfam': 1,
                                               'scientific_name': 'Super : diper - name;'
                                               })[0]
     rep = self.getWsClient().get_objects([{'ref': result['report_ref']}])[0]['data']
     self.assertTrue('text_message' in rep)
     print("Report:\n" + str(rep['text_message']))
     genome_ref = self.getWsName() + "/" + genome_name
     genome = self.getWsClient().get_objects([{'ref': genome_ref}])[0]['data']
     features_to_work = {}
     for feature in genome['features']:
         features_to_work[feature['id']] = feature['location']
     aseq = AssemblySequenceAPI(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token'])
     dna_sequences = aseq.get_dna_sequences({'requested_features': features_to_work, 
                                             'assembly_ref': genome['assembly_ref']})['dna_sequences']
     bad_dnas = 0
     for feature in genome['features']:
         if feature['dna_sequence'] != dna_sequences[feature['id']]:
             bad_dnas += 1
     self.assertEqual(bad_dnas, 0)
Example #31
    def upload_assembly(self):
        if not self.testobjref:

            print "upload_assembly start"
    
            indata = 'U00096.2.fa'#_first1000.
            ftarget = os.path.join(self.cfg['scratch'], indata)#self.scratch, indata)
            print "ftarget " + ftarget
            ret = shutil.copy('../test_data/' + indata, ftarget)
    
            #self.readsUtilClient = ReadsUtils(os.environ['SDK_CALLBACK_URL'])

            self.assemblyUtilClient = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])

            if not self.testwsname:
                self.testwsname.append(self.create_random_string())
    
            print "upload_assembly self.testwsname[0] " + self.testwsname[0]
    
            try:
                ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  #test_ws_name
            except Exception as e:
                #print "ERROR"
                #print(type(e))
                #print(e.args)
                print(e)
                pass
    
            try:
                print "attempt upload"
                print "ftarget " + ftarget
                ref = self.assemblyUtilClient.save_assembly_from_fasta(
                    {
                     'workspace_name': self.testwsname[0],
                     'assembly_name': 'Ecolik12MG1655',
                     'file': {'path': ftarget}})
        
                print "upload_assembly"
                print ref
                #self.testobjref = []
                self.testobjref.append(self.testwsname[0] + '/Ecolik12MG1655/1')
                #self.testobjdata = []
                #self.testobjdata.append(self.dfu.get_objects(
                #    {'object_refs': [self.testobjref[0]]}))
        
                ##print self.testobjdata[0]
    
            except Exception as e:
                print e
                pass
    
            print "self.testobjref[0]"
            print self.testobjref
            print self.testobjref[0]
Example #32
    def stage_assembly_files(self, object_list):
        """
        _stage_assembly_files: download the fasta files to the scratch area
        return list of file names
        """
        log('Processing assembly object list: {}'.format(object_list))

        auc = AssemblyUtil(self.callbackURL)
        staged_file_list = []

        for assembly_upa in object_list:
            try:
                filename = auc.get_assembly_as_fasta({'ref': assembly_upa})['path']
            except AssemblyUtilError as assembly_error:
                print(str(assembly_error))
                raise

            staged_file_list.append(filename)

        log('Created file list: {}'.format(staged_file_list))
        return staged_file_list
Example #33
    def run_megahit(self, ctx, params):
        """
        :param params: instance of type "MegaHitParams" (Run MEGAHIT.  Most
           parameters here are just passed forward to MEGAHIT workspace_name
           - the name of the workspace for input/output read_library_ref -
           the name of the PE read library (SE library support in the future)
           output_contig_set_name - the name of the output contigset
           megahit_parameter_preset - override a group of parameters;
           possible values: meta            '--min-count 2 --k-list
           21,41,61,81,99' (generic metagenomes, default) meta-sensitive 
           '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more
           sensitive but slower) meta-large      '--min-count 2 --k-list
           27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
           bulk            '--min-count 3 --k-list 31,51,71,91,99 --no-mercy'
           (experimental, standard bulk sequencing with >= 30x depth)
           single-cell     '--min-count 3 --k-list 21,33,55,77,99,121
           --merge_level 20,0.96' (experimental, single cell data) min_count
           - minimum multiplicity for filtering (k_min+1)-mers, default 2
           min_k - minimum kmer size (<= 127), must be odd number, default 21
           max_k - maximum kmer size (<= 127), must be odd number, default 99
           k_step - increment of kmer size of each iteration (<= 28), must be
           even number, default 10 k_list - list of kmer size (all must be
           odd, in the range 15-127, increment <= 28); override `--k-min',
           `--k-max' and `--k-step' min_contig_length - minimum length of
           contigs to output, default is 2000 @optional
           megahit_parameter_preset @optional min_count @optional k_min
           @optional k_max @optional k_step @optional k_list @optional
           min_contig_length) -> structure: parameter "workspace_name" of
           String, parameter "read_library_ref" of String, parameter
           "output_contigset_name" of String, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_length" of Long
        :returns: instance of type "MegaHitOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_megahit
        print('Running run_megahit with params=')
        pprint(params)

        # STEP 1: basic parameter checks + parsing
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'read_library_ref' not in params:
            raise ValueError('read_library_ref parameter is required')
        if 'output_contigset_name' not in params:
            raise ValueError('output_contigset_name parameter is required')

        # STEP 2: get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {'read_libraries': [input_ref],
                        'interleaved': 'false',
                        'gzipped': None
                        }
        ru = ReadsUtils(self.callbackURL)
        reads = ru.download_reads(reads_params)['files']

        print('Input reads files:')
        fwd = reads[input_ref]['files']['fwd']
        rev = reads[input_ref]['files']['rev']
        pprint('forward: ' + fwd)
        pprint('reverse: ' + rev)

        # STEP 3: run megahit
        # construct the command
        megahit_cmd = [self.MEGAHIT]

        # we only support PE reads, so add that
        megahit_cmd.append('-1')
        megahit_cmd.append(fwd)
        megahit_cmd.append('-2')
        megahit_cmd.append(rev)

        # if a preset is defined, use that:
        if 'megahit_parameter_preset' in params:
            if params['megahit_parameter_preset']:
                megahit_cmd.append('--presets')
                megahit_cmd.append(params['megahit_parameter_preset'])

        if 'min_count' in params:
            if params['min_count']:
                megahit_cmd.append('--min-count')
                megahit_cmd.append(str(params['min_count']))
        if 'k_min' in params:
            if params['k_min']:
                megahit_cmd.append('--k-min')
                megahit_cmd.append(str(params['k_min']))
        if 'k_max' in params:
            if params['k_max']:
                megahit_cmd.append('--k-max')
                megahit_cmd.append(str(params['k_max']))
        if 'k_step' in params:
            if params['k_step']:
                megahit_cmd.append('--k-step')
                megahit_cmd.append(str(params['k_step']))
        if 'k_list' in params:
            if params['k_list']:
                k_list = []
                for k_val in params['k_list']:
                    k_list.append(str(k_val))
                megahit_cmd.append('--k-list')
                megahit_cmd.append(','.join(k_list))

        min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
        if 'min_contig_length' in params:
            if params['min_contig_length']:
                if str(params['min_contig_length']).isdigit():
                    min_contig_length = params['min_contig_length']
                else:
                    raise ValueError('min_contig_length parameter must be a non-negative integer')

        megahit_cmd.append('--min-contig-len')
        megahit_cmd.append(str(min_contig_length))

        # set the output location
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        megahit_cmd.append('-o')
        megahit_cmd.append(output_dir)

        # run megahit
        print('running megahit:')
        print('    ' + ' '.join(megahit_cmd))
        p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
        retcode = p.wait()

        print('Return code: ' + str(retcode))
        if p.returncode != 0:
            raise ValueError('Error running MEGAHIT, return code: ' +
                             str(retcode) + '\n')

        output_contigs = os.path.join(output_dir, 'final.contigs.fa')

        # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
        if self.mac_mode:
            shutil.move(output_contigs, os.path.join(self.host_scratch, 'final.contigs.fa'))
            output_contigs = os.path.join(self.host_scratch, 'final.contigs.fa')

        # STEP 4: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL)
        output_data_ref = assemblyUtil.save_assembly_from_fasta({
                                                                'file': {'path': output_contigs},
                                                                'workspace_name': params['workspace_name'],
                                                                'assembly_name': params['output_contigset_name']
                                                                })


        # STEP 5: generate and save the report

        # compute a simple contig length distribution for the report
        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        report = ''
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                                 'label': params['output_contigset_name']}]})
        except QUASTError as qe:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok 
            print('Logging exception from running QUAST')
            print(str(qe))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report(
                {'message': report,
                 'objects_created': [{'ref': output_data_ref, 'description': 'Assembled contigs'}],
                 'direct_html_link_index': 0,
                 'html_links': [{'shock_id': quastret['shock_id'],
                                 'name': 'report.html',
                                 'label': 'QUAST report'}
                                ],
                 'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
                 'workspace_name': params['workspace_name']
                 })
        except _RepError as re:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok 
            print('Logging exception from creating report object')
            print(str(re))
            # TODO delete shock node
            raise

        # STEP 6: construct the output to send back
        output = {'report_name': report_info['name'], 'report_ref': report_info['ref']}

        #END run_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
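A hedged params sketch for run_megahit, based only on the required-key checks in STEP 1 and the optional keys listed in the docstring; the read library reference and names are placeholders.

# Only the first three keys are required by the checks in STEP 1.
params = {
    'workspace_name': 'my_test_workspace',
    'read_library_ref': '12345/9/1',             # paired-end read library ref (placeholder)
    'output_contigset_name': 'MEGAHIT.contigs',
    'megahit_parameter_preset': 'meta',          # optional; see the presets in the docstring
    'min_contig_length': 2000                    # optional; defaults to DEFAULT_MIN_CONTIG_LENGTH
}
# output = impl.run_megahit(ctx, params)[0]      # -> {'report_name': ..., 'report_ref': ...}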
Example #34
    def do_assembly(self, assemblyRef, wsName):
        #try:
        #    assembly = wsClient.get_objects2({'objects': [{'ref': assembly_ref}]})
        #except:
        #    exc_type, exc_value, exc_traceback = sys.exc_info()
        #    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        #    orig_error = ''.join('    ' + line for line in lines)
        #    raise ValueError('Error from workspace:\n' + orig_error)

        #print assembly#[200:]
        #print assembly['data']
        #print assembly['data'][0]
        #assembly['data'][0]['data']

        #fasta_handle_ref = assembly['data'][0]['data']['fasta_handle_ref']
        #print "fasta_handle_ref "+fasta_handle_ref
        #print type(fasta_handle_ref)

        #TODO create file here /kb/module/work
        #TODO set output file name
        print "SDK_CALLBACK_URL "+os.environ['SDK_CALLBACK_URL']
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        #assembly_input_ref = "16589/2/1"
        #filename = "test.fasta"
        #obj_name = "EcoliMG1655.f"
        #wsname = "example_assembly"

        param = dict()
        param['ref'] = assemblyRef#assembly_input_ref

        input_fasta_file = au.get_assembly_as_fasta(param)#{'ref': assembly_input_ref})

        #just_input_fasta_file = os.path.basename(input_fasta_file['path'])
        #print "input_fasta_file "+ str(input_fasta_file['path'])

        newtmp = "/kb/module/work/tmp/tmp_"+self.create_random_string()
        os.mkdir(newtmp)
        os.mkdir(newtmp+"/input")

        newfasta = newtmp +"/input/"+os.path.basename(input_fasta_file['path'])
        print "newfasta "+newfasta

        os.rename(input_fasta_file['path'], newfasta)

        args = ["wrapper_phage_contigs_sorter_iPlant.pl ", "--db 2 ","--fna ", newfasta," --wdir ",newtmp]

        print str(args)

        cmdstring = "".join(args)

        print "Executing"
        cmdProcess = subprocess.Popen(cmdstring, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        print "Done "+str(cmdProcess)
        stdout, stderr = cmdProcess.communicate()
        print " stdout: " + stdout
        print " stderr: " + stderr

        #return [report]

        # Step 5 - Build a Report and return
        reportObj = {
            'objects_created': [],
            'text_message': stdout
        }
        # 'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],

        #report_info = report.create({'report': reportObj, 'workspace_name': wsName})

        #reportObj = {
        #    'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
        #    'text_message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total)
        #}
        #report = KBaseReport(self.callback_url)
        #report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})


        # construct the output to send back
        #output = {'report_name': report_info['name'],
        #          'report_ref': report_info['ref']
        #          }
        #print('returning:' + pformat(output))

        print('Saving report')
        kbr = KBaseReport(self.callback_url, service_ver='dev')
        report = ''
        report += "cmdstring: " + str(cmdstring) + " stdout: " + str(stdout) + " stderr: " + str(stderr)

        virout = newtmp+"/"+"VIRSorter_global-phage-signal.csv"
        with open(virout, 'r') as myfile:
            data = myfile.read().replace('\n', '')

        print "wsName "+str(wsName)

        data = data.replace(",", "\t")
        data = data.replace("##", "\n##")
        report = report +"\n\n***** VirSorter output *****\n"+data
        report_data = {'message': report,
             'objects_created': None,
             'direct_html_link_index': None,
             'html_links': None,
             'report_object_name': 'kb_virsorter_' + str(uuid.uuid4()),
             'workspace_name': wsName
             }

        print "report_data"
        print str(report_data)
        report_info = kbr.create_extended_report(report_data
            )

        # 'objects_created': [{'ref': assembly_ref, 'description': 'Assembled contigs'}],
        # 'html_links': [{'shock_id': quastret['shock_id'],
        #                     'name': 'report.html',
        #                     'label': 'QUAST report'}
        #                    ],

        reportName = report_info['name']
        reportRef = report_info['ref']
        return reportName, reportRef
Example #35
    def arast_run(self, ctx, params, assembler, server='http://localhost:8000'):
        output = None

        console = []
        self.log(console,'Running run_{} with params='.format(assembler))
        self.log(console, pformat(params))

        #### do some basic checks
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'read_library_refs' not in params and 'read_library_names' not in params:
            raise ValueError('read_library_refs or read_library_names parameter is required')
        if 'read_library_refs' in params:
            if type(params['read_library_refs']) != list:
                raise ValueError('read_library_refs must be a list')
        if 'read_library_names' in params:
            if type(params['read_library_names']) != list:
                raise ValueError('read_library_names must be a list')
        if 'output_contigset_name' not in params:
            raise ValueError('output_contigset_name parameter is required')
        min_contig_len = params.get('min_contig_len') or 300

        token = ctx['token']

        os.environ["KB_AUTH_TOKEN"] = token
        os.environ["ARAST_URL"] =  server

        ws = workspaceService(self.workspaceURL)
        ws_libs = []
        if 'read_library_refs' in params:
            for lib_ref in params['read_library_refs']:
                ws_libs.append({'ref': lib_ref})
        if 'read_library_names' in params:
            for lib_name in params['read_library_names']:
                ws_libs.append({'ref': params['workspace_name'] + '/' + lib_name})
        if len(ws_libs)==0:
            raise ValueError('At least one read library must be provided in read_library_refs or read_library_names')
        libs = ws.get_objects2({'objects': ws_libs})['data']

        wsid = libs[0]['info'][6]

        kbase_assembly_input = self.combine_read_libs(libs)
        tmp_data = self.create_temp_json(kbase_assembly_input)

        mode = ''
        cmd = ['ar-run', '--data-json', tmp_data]
        if assembler:
            cmd = cmd + ['-a', assembler]
            mode = 'assembler: ' + assembler
        elif 'pipeline' in params and params['pipeline']:
            cmd = cmd + ['-p', params['pipeline']]
            mode = 'assembly pipeline: ' + params['pipeline']
        else:
            cmd = cmd + ['-r', params.get('recipe', 'auto')]
            mode = 'assembly recipe: ' + params.get('recipe', 'auto')

        logger.info('Start {}'.format(mode))
        logger.debug('CMD: {}'.format(' '.join(cmd)))

        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT, shell=False)

        out, err = p.communicate()
        logger.debug(out)

        if p.returncode != 0:
            raise ValueError('Error running ar_run, return code: {}\n'.format(p.returncode))

        job_id = None
        match = re.search('(\d+)', out)
        if match:
            job_id = match.group(1)
        else:
            raise ValueError('No integer job ID found: {}\n'.format(out))

        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000)
        output_dir = os.path.join(self.scratch, 'output.'+str(timestamp))
        output_contigs = os.path.join(output_dir, 'contigs.fa')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        cmd = ['ar-get', '-j', job_id, '-w', '-l']
        logger.debug('CMD: {}'.format(' '.join(cmd)))
        ar_log = subprocess.check_output(cmd)

        self.log(console, ar_log)

        cmdstr = 'ar-get -j {} -w -p | ar-filter -l {} > {}'.format(job_id, min_contig_len, output_contigs)
        logger.debug('CMD: {}'.format(cmdstr))
        subprocess.check_call(cmdstr, shell=True)

        cmd = ['ar-get', '-j', job_id, '-w', '-r']
        logger.debug('CMD: {}'.format(' '.join(cmd)))
        ar_report = subprocess.check_output(cmd)

        self.log(console, "\nDONE\n")

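        # Save the filtered contigs as an Assembly object via AssemblyUtil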
        client = AssemblyUtil(self.callback_url)
        assembly_ref = client.save_assembly_from_fasta({
            'file': {'path': output_contigs},
            'workspace_name': params['workspace_name'],
            'assembly_name': params['output_contigset_name']
        })
        
        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        if 'read_library_names' in params:
            provenance[0]['input_ws_objects'] = [params['workspace_name'] + '/' + x
                                                 for x in params['read_library_names']]
        elif 'read_library_refs' in params:
            provenance[0]['input_ws_objects'] = list(params['read_library_refs'])


        os.remove(tmp_data)
        #shutil.rmtree(output_dir)

        # create a Report
        report = ''
        report += '============= Raw Contigs ============\n' + ar_report + '\n'

        report += '========== Filtered Contigs ==========\n'
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Average Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   '+str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c+1]) + ' bp\n'

        print(report)

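        # Build the report object and save it (hidden) in the same workspace as the input libraries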
        reportObj = {
            'objects_created': [{'ref': params['workspace_name'] + '/' + params['output_contigset_name'],
                                 'description': 'Assembled contigs'}],
            'text_message': report
        }

        reportName = '{}.report.{}'.format(assembler or 'assembly', job_id)
        report_obj_info = ws.save_objects({
                'id': wsid,
                'objects': [
                    {
                        'type': 'KBaseReport.Report',
                        'data': reportObj,
                        'name': reportName,
                        'meta': {},
                        'hidden': 1,
                        'provenance': provenance
                    }
                ]
            })[0]

        output = {'report_name': reportName,
                  'report_ref': '{}/{}/{}'.format(report_obj_info[6], report_obj_info[0], report_obj_info[4])}

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method filter_contigs return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return output
class kb_virsorterTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        user_id = requests.post(
            'https://kbase.us/services/authorization/Sessions/Login',
            data='token={}&fields=user_id'.format(token)).json()['user_id']
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'kb_virsorter',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})

        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_virsorter'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = kb_virsorter(cls.cfg)

        cls.testobjref = []
        #cls.testobjdata = []
        cls.testwsname = []

    @classmethod
    def tearDownClass(cls):
        if hasattr(cls, 'wsName'):
            cls.wsClient.delete_workspace({'workspace': cls.wsName})
            print('Test workspace was deleted')

        if hasattr(cls, 'testwsname') and len(cls.testwsname) > 0:
            try:
                print('Deleting workspace 2 ' + cls.testwsname[0])
                cls.wsClient.delete_workspace({'workspace': cls.testwsname[0]})
                print('Test workspace 2 was deleted ' + cls.testwsname[0])
            except Exception as e:
                print(e)

        #if hasattr(cls, 'testobjdata'):
        #    try:
        #        print('Deleting shock data ' + str(len(cls.testobjdata)))
        #        print('Deleting shock data ' + str(len(cls.testobjdata[0]['data'][0])))
        #        print('Deleting shock data ' + str(cls.testobjdata[0]))
        #        node = cls.testobjdata[0]['data'][0]['lib']['file']['id']
        #        cls.delete_shock_node(node)
        #        print('Test shock data was deleted')
        #    except Exception as e:
        #        print e

    def getWsClient(self):
        return self.__class__.wsClient

    def getWsName(self):
        if hasattr(self.__class__, 'wsName'):
            return self.__class__.wsName
        suffix = int(time.time() * 1000)
        wsName = "test_kb_virsorter_" + str(suffix)
        ret = self.getWsClient().create_workspace({'workspace': wsName})
        self.__class__.wsName = wsName
        return wsName

    def getImpl(self):
        return self.__class__.serviceImpl

    def getContext(self):
        return self.__class__.ctx
    
    
    def write_file(self, filename, content):
        tmp_dir = self.cfg['scratch']
        file_path = os.path.join(tmp_dir, filename)
        with open(file_path, 'w') as fh1:
            fh1.write(content)
        return file_path


    def delete_shock_node(self, node_id):
        # Note: assumes the module's deploy config exposes a 'shock-url' entry
        token = environ.get('KB_AUTH_TOKEN', None)
        header = {'Authorization': 'Oauth {0}'.format(token)}
        requests.delete(self.cfg['shock-url'] + '/node/' + node_id, headers=header,
                        allow_redirects=True)

    def ztest_aaa_upload_to_shock(self):

        print("upload ref data to shock staging")
        self.dfUtil = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        #file_path =  self.write_file('Phage_gene_catalog.tar.gz', 'Test')

        input_file_name = 'Phage_gene_catalog_plus_viromes.tar.gz'  # alternatives: 'Phage_gene_catalog.tar.gz', 'PFAM_27.tar.gz'
        source_file_path = "/kb/module/work/" + input_file_name

        tmp_dir = self.cfg['scratch']
        target_file_path = os.path.join(tmp_dir, input_file_name)

        print("file_path " + source_file_path + "\t" + target_file_path)

        orig_size = os.path.getsize(source_file_path)

        shutil.copy(source_file_path, target_file_path)

        print("Testing " + target_file_path)
        print(os.path.isfile(target_file_path))

        ret1 = self.dfUtil.file_to_shock({'file_path': target_file_path})

        print(str(ret1))
        shock_id = ret1['shock_id']

        print("shock_id " + shock_id)
        file_path2 = os.path.join("/kb/module/work/", 'test.tar.gz')

        #ret2 = self.dfUtil.shock_to_file(
        #    {'shock_id': shock_id, 'file_path': file_path2})[0]
        ret2 = self.dfUtil.shock_to_file(
            {'shock_id': shock_id, 'file_path': file_path2})

        print(ret2)

        file_name = ret2['node_file_name']
        attribs = ret2['attributes']
        self.assertEqual(file_name, 'Phage_gene_catalog_plus_viromes.tar.gz')
        self.assertEqual(ret2['file_path'], file_path2)
        self.assertEqual(ret2['size'], orig_size)
        self.assertIsNone(attribs)

        #self.delete_shock_node(shock_id)


    def create_random_string(self):
        N = 20
        return ''.join(
            random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(N))

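    # End-to-end check: upload a test assembly, run run_virsorter, and verify a report is returned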
    def test_virsorter_ok(self):
        self.upload_assembly()


        if not self.testwsname:
            self.testwsname.append(self.create_random_string())

        print("upload_reads self.testwsname[0] " + self.testwsname[0])

        #try:
        #    ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  # test_ws_name
        #except Exception as e:
        #    # print "ERROR"
        #    # print(type(e))
        #    # print(e.args)
        #    print(e)
        #    pass

        print("self.testwsname " + str(self.testwsname))
        params = {}
        params['assembly_ref'] = str(self.testobjref[0])  # e.g. '16589/2/1'
        params['ws_name'] = self.testwsname[0]

        result = self.getImpl().run_virsorter(self.getContext(), params)
        print('RESULT run_virsorter:')
        pprint(result)

        # The call should return a single report descriptor containing exactly these two keys
        testresult = [{'report_ref': result[0]['report_ref'],
                       'report_name': result[0]['report_name']}]

        self.assertEqual(result, testresult)


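    # Upload the E. coli K-12 test FASTA as an Assembly object once and cache its reference for reuse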
    def upload_assembly(self):
        if not self.testobjref:

            print("upload_assembly start")

            indata = 'U00096.2.fa'
            ftarget = os.path.join(self.cfg['scratch'], indata)
            print("ftarget " + ftarget)
            ret = shutil.copy('../test_data/' + indata, ftarget)
    
            #self.readsUtilClient = ReadsUtils(os.environ['SDK_CALLBACK_URL'])

            self.assemblyUtilClient = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])

            if not self.testwsname:
                self.testwsname.append(self.create_random_string())
    
            print("upload_assembly self.testwsname[0] " + self.testwsname[0])
    
            try:
                ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  #test_ws_name
            except Exception as e:
                #print "ERROR"
                #print(type(e))
                #print(e.args)
                print(e)
                pass
    
            try:
                print("attempt upload")
                print("ftarget " + ftarget)
                ref = self.assemblyUtilClient.save_assembly_from_fasta(
                    {
                     'workspace_name': self.testwsname[0],
                     'assembly_name': 'Ecolik12MG1655',
                     'file': {'path': ftarget}})
        
                print("upload_assembly")
                print(ref)
                #self.testobjref = []
                self.testobjref.append(self.testwsname[0] + '/Ecolik12MG1655/1')
                #self.testobjdata = []
                #self.testobjdata.append(self.dfu.get_objects(
                #    {'object_refs': [self.testobjref[0]]}))
        
                ##print self.testobjdata[0]
    
            except Exception as e:
                print(e)
                pass
    
            print("self.testobjref[0]")
            print(self.testobjref)
            print(self.testobjref[0])