Пример #1
0
    def generate_report(self, input_file_name, params, out_folder, wsname):
        """
        Build the text summary of the assembly, run QUAST and save the report.

        :param input_file_name: path of the contig FASTA file; handed to
            ``self.load_stats`` and to QUAST.
        :param params: method parameters; the ``self.PARAM_IN_WS`` and
            ``self.PARAM_IN_CS_NAME`` entries are read from it.
        :param out_folder: output folder name, used only in the report text.
        :param wsname: workspace name, used only in the report text.
        :returns: tuple ``(report_name, report_ref)`` taken from the result of
            ``create_extended_report``.
        """
        self.log('Generating and saving report')

        # every value returned by load_stats is treated as a contig length
        fasta_stats = self.load_stats(input_file_name)
        lengths = [fasta_stats[contig_id] for contig_id in fasta_stats]

        assembly_ref = params[self.PARAM_IN_WS] + '/' + params[
            self.PARAM_IN_CS_NAME]

        # An empty assembly must not abort the report with ZeroDivisionError.
        avg_length = sum(lengths) / float(len(lengths)) if lengths else 0.0

        report_parts = [
            'Velvet results saved to: ' + wsname + '/' + out_folder + '\n',
            'Assembly saved to: ' + assembly_ref + '\n',
            'Assembled into ' + str(len(lengths)) + ' contigs.\n',
            'Avg Length: ' + str(avg_length) + ' bp.\n',
        ]

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)  # @UndefinedVariable
        report_parts.append(
            'Contig Length Distribution (# of contigs -- min to max basepairs):\n')
        # edges has one more entry than counts: bin i spans edges[i]..edges[i + 1]
        for count, lower, upper in zip(counts, edges[:-1], edges[1:]):
            report_parts.append('   ' + str(count) + '\t--\t' + str(lower) +
                                ' to ' + str(upper) + ' bp\n')
        report = ''.join(report_parts)

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        quastret = kbq.run_QUAST({
            'files': [{
                'path': input_file_name,
                'label': params[self.PARAM_IN_CS_NAME]
            }]
        })

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        report_info = kbr.create_extended_report({
            'message': report,
            'objects_created': [{
                'ref': assembly_ref,
                'description': 'Assembled contigs'
            }],
            'direct_html_link_index': 0,
            # the QUAST HTML output is linked through the shock node it returned
            'html_links': [{
                'shock_id': quastret['shock_id'],
                'name': 'report.html',
                'label': 'QUAST report'
            }],
            'report_object_name': 'kb_velvet_report_' + str(uuid.uuid4()),
            'workspace_name': params[self.PARAM_IN_WS]
        })
        return report_info['name'], report_info['ref']
Пример #2
0
    def __init__(self, prj_dir, config):
        """
        Record the service settings from ``config`` and create the clients.

        'workspace-url', 'SDK_CALLBACK_URL' and 'KB_AUTH_TOKEN' are looked up
        without a default. 'shock-url' and 'handle-service-url' are optional:
        ``self.shock_url`` / ``self.handle_url`` are only set when the
        corresponding key is present in ``config``.

        :param prj_dir: stored as ``self.proj_dir`` and passed to the program
            runner.
        :param config: mapping holding the settings listed above.
        """
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']

        # optional endpoints: the attribute stays undefined when the key is absent
        optional_settings = (('shock_url', 'shock-url'),
                             ('handle_url', 'handle-service-url'))
        for attr_name, config_key in optional_settings:
            if config_key in config:
                setattr(self, attr_name, config[config_key])

        callback, auth_token = self.callback_url, self.token
        self.ws_client = Workspace(self.workspace_url, token=auth_token)
        self.ru = ReadsUtils(callback, token=auth_token)
        self.au = AssemblyUtil(callback, token=auth_token)
        # the report and QUAST clients are created without a token
        self.kbr = KBaseReport(callback)
        self.kbq = kb_quast(callback)

        self.proj_dir = prj_dir
        self.prog_runner = Program_Runner(self.MaSuRCA_BIN, self.proj_dir)
Пример #3
0
    def __init__(self, prj_dir, config):
        """
        Record the service settings from ``config`` and create the clients.

        'workspace-url', 'SDK_CALLBACK_URL' and 'KB_AUTH_TOKEN' are looked up
        without a default. 'shock-url' and 'handle-service-url' are optional:
        ``self.shock_url`` / ``self.handle_url`` are only set when the
        corresponding key is present in ``config``. The SPAdes version label
        is built from the ``SPADES_VERSION`` environment variable.

        :param prj_dir: stored as ``self.proj_dir``.
        :param config: mapping holding the settings listed above.
        """
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']

        # optional endpoints: the attribute stays undefined when the key is absent
        optional_settings = (('shock_url', 'shock-url'),
                             ('handle_url', 'handle-service-url'))
        for attr_name, config_key in optional_settings:
            if config_key in config:
                setattr(self, attr_name, config[config_key])

        callback, auth_token = self.callback_url, self.token
        self.ws_client = Workspace(self.workspace_url, token=auth_token)
        # reads and assembly clients are pinned to the 'release' service version
        self.ru = ReadsUtils(callback, token=auth_token, service_ver='release')
        self.au = AssemblyUtil(callback, token=auth_token, service_ver='release')
        self.kbr = KBaseReport(callback)
        self.kbq = kb_quast(callback)
        self.proj_dir = prj_dir

        # SPADES_VERSION is read from the environment with no fallback
        self.spades_version = 'SPAdes-' + os.environ['SPADES_VERSION']
Пример #4
0
    def run_megahit(self, ctx, params):
        """
        :param params: instance of type "MegaHitParams" (Run MEGAHIT. Most
           parameters here are just passed forward to MEGAHIT workspace_name
           - the name of the workspace for input/output read_library_ref -
           the name of the PE read library (SE library support in the future)
           output_contig_set_name - the name of the output contigset
           megahit_parameter_preset - override a group of parameters;
           possible values: meta            '--min-count 2 --k-list
           21,41,61,81,99' (generic metagenomes, default) meta-sensitive
           '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more
           sensitive but slower) meta-large      '--min-count 2 --k-list
           27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
           bulk            '--min-count 3 --k-list 31,51,71,91,99 --no-mercy'
           (experimental, standard bulk sequencing with >= 30x depth)
           single-cell     '--min-count 3 --k-list 21,33,55,77,99,121
           --merge_level 20,0.96' (experimental, single cell data) min_count
           - minimum multiplicity for filtering (k_min+1)-mers, default 2
           k_min - minimum kmer size (<= 255), must be odd number, defaults
           to 21 k_max - maximum kmer size (<= 255), must be odd number,
           defaults to 141 k_step - increment of kmer size of each iteration
           (<= 28), must be even number, defaults to 10 k_list - list of kmer
           sizes (all must be odd, in the range 15-255, increment <= 28);
           override using `--k-min', `--k-max' and `--k-step'
           min_contig_length - minimum length of contigs to output, default
           is 2000 max_mem_percent - maximum memory to make available to
           MEGAHIT, as a percentage of available system memory (optional,
           default = 0.9 or 90%) @optional megahit_parameter_preset @optional
           min_count @optional k_min @optional k_max @optional k_step
           @optional k_list @optional min_contig_length @optional
           max_mem_percent) -> structure: parameter "workspace_name" of
           String, parameter "read_library_ref" of String, parameter
           "output_contigset_name" of String, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_length" of Long, parameter "max_mem_percent"
           of Double
        :returns: instance of type "MegaHitOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_megahit
        print('Running run_megahit with params=')
        pprint(params)

        # STEP 1: basic parameter checks + parsing
        for required in ('workspace_name', 'read_library_ref', 'output_contigset_name'):
            if required not in params:
                raise ValueError(required + ' parameter is required')

        # STEP 2: get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {'read_libraries': [input_ref],
                        'interleaved': 'false',
                        'gzipped': None
                        }
        ru = ReadsUtils(self.callbackURL)
        reads = ru.download_reads(reads_params)['files']

        print('Input reads files:')
        read_files = reads[input_ref]['files']
        fwd = read_files['fwd']
        rev = read_files.get('rev')
        # Only paired-end input is supported; without this check a library
        # lacking reverse reads fails below with an opaque TypeError/KeyError.
        if not rev:
            raise ValueError('read_library_ref must be a paired-end read library; '
                             'no reverse reads file was returned for ' + str(input_ref))
        pprint('forward: ' + fwd)
        pprint('reverse: ' + rev)

        # STEP 3: run megahit
        # construct the command; we only support PE reads, so add both files
        megahit_cmd = [self.MEGAHIT, '-1', fwd, '-2', rev]

        # if a preset is defined, use that:
        if params.get('megahit_parameter_preset'):
            megahit_cmd.extend(['--presets', params['megahit_parameter_preset']])

        # scalar options are forwarded only when present and truthy
        scalar_options = (('min_count', '--min-count'),
                          ('k_min', '--k-min'),
                          ('k_max', '--k-max'),
                          ('k_step', '--k-step'))
        for param_name, flag in scalar_options:
            if params.get(param_name):
                megahit_cmd.extend([flag, str(params[param_name])])

        if params.get('k_list'):
            megahit_cmd.extend(['--k-list',
                                ','.join(str(k_val) for k_val in params['k_list'])])

        # a missing or falsy min_contig_length keeps the class default
        min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
        if params.get('min_contig_length'):
            if not str(params['min_contig_length']).isdigit():
                raise ValueError('min_contig_length parameter must be a non-negative integer')
            min_contig_length = params['min_contig_length']

        megahit_cmd.extend(['--min-contig-len', str(min_contig_length)])

        # Set the number of CPUs to the number of cores minus 1 (at least 1)
        megahit_cmd.extend(['--num-cpu-threads',
                            str(max(multiprocessing.cpu_count() - 1, 1))])

        # set mem usage
        # Note: defaults to 0.9 (90% of the system memory available to the
        # container). An explicit None is treated like a missing value so that
        # the literal string 'None' is never passed to MEGAHIT.
        max_mem_percent = params.get('max_mem_percent')
        if max_mem_percent is None:
            max_mem_percent = 0.9
        megahit_cmd.extend(['-m', str(max_mem_percent)])

        # set the output location (millisecond timestamp keeps the name unique per run)
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        megahit_cmd.extend(['-o', output_dir])

        # run megahit
        print('running megahit:')
        print('    ' + ' '.join(megahit_cmd))
        p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
        retcode = p.wait()

        print('Return code: ' + str(retcode))
        if retcode != 0:
            error_str = report_megahit_error(output_dir, retcode)
            raise RuntimeError(error_str)

        output_contigs = os.path.join(output_dir, 'final.contigs.fa')

        # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
        if self.mac_mode:
            host_contigs = os.path.join(self.host_scratch, 'final.contigs.fa')
            shutil.move(output_contigs, host_contigs)
            output_contigs = host_contigs

        # STEP 4: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL)
        output_data_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': output_contigs},
            'workspace_name': params['workspace_name'],
            'assembly_name': params['output_contigset_name']
        })

        # STEP 5: generate and save the report

        # compute a simple contig length distribution for the report
        lengths = [len(seq_record.seq) for seq_record in SeqIO.parse(output_contigs, 'fasta')]

        # guard the average so an empty contig file cannot raise ZeroDivisionError
        avg_length = sum(lengths) / float(len(lengths)) if lengths else 0.0

        report = ''
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Avg Length: ' + str(avg_length) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        # edges has one more entry than counts: bin i spans edges[i]..edges[i + 1]
        for count, lower, upper in zip(counts, edges[:-1], edges[1:]):
            report += '   ' + str(count) + '\t--\t' + str(lower) + ' to ' + str(upper) + ' bp\n'

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                                 'label': params['output_contigset_name']}]})
        except ServerError as quast_err:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from running QUAST')
            print(str(quast_err))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report(
                {'message': report,
                 'objects_created': [{'ref': output_data_ref, 'description': 'Assembled contigs'}],
                 'direct_html_link_index': 0,
                 'html_links': [{'shock_id': quastret['shock_id'],
                                 'name': 'report.html',
                                 'label': 'QUAST report'}
                                ],
                 'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
                 'workspace_name': params['workspace_name']
                 })
        except ServerError as report_err:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from creating report object')
            print(str(report_err))
            # TODO delete shock node
            raise

        # STEP 6: construct the output to send back
        output = {'report_name': report_info['name'], 'report_ref': report_info['ref']}

        #END run_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Пример #5
0
    def finish_run(self, params):
        """
        Finish up the run by uploading output and creating the report.

        Looks for ``final_assembly.fa`` under ``self.scratch``, filters it with
        ``self.filter_contigs_by_length``, saves the result as an Assembly,
        runs QUAST on it and creates an extended report.

        :param params: method parameters; 'output_contigset_name',
            'workspace_name' and 'assembly_size_filter' are read from it.
        :returns: dict with the keys 'report_name' and 'report_ref'.
        :raises RuntimeError: when no ``final_assembly.fa`` file is found.
        :raises ValueError: when the filtered FASTA file is empty.
        """
        console = []
        self.log(console, 'Running post')

        # log line kept as-is; this method only post-processes existing output
        self.log(console, 'running hipmer:')

        # grab path of output contigs (if several exist, the last one walked wins)
        output_contigs = ''
        for root, _subdirs, files in os.walk(self.scratch):
            if 'final_assembly.fa' in files:
                output_contigs = os.path.join(root, 'final_assembly.fa')
                print("found OUTPUT CONTIGS {}".format(output_contigs))

        output_name = params['output_contigset_name']
        slurm_out = os.path.join(self.scratch, 'slurm.out')

        if not os.path.exists(output_contigs):
            self.log(console, "It looks like HipMER failed. Could not find the output contigs.")
            self.log(console, "Show errors in log file")
            # A missing log file must not mask the RuntimeError raised below.
            if os.path.exists(slurm_out):
                with open(slurm_out, 'r') as f:
                    for line in f:
                        if 'error' in line.lower():
                            self.log(console, line)
            raise RuntimeError("Error in HipMER execution")

        wsname = params['workspace_name']

        self.log(console, 'Filtering short length contigs from HipMer assembly')

        assemblyUtil = AssemblyUtil(self.callbackURL, token=self.token)

        assembly_size_filter = params['assembly_size_filter']

        filtered_fasta_file_path = self.filter_contigs_by_length(output_contigs, assembly_size_filter)

        if os.stat(filtered_fasta_file_path).st_size == 0:
            # implicit concatenation avoids embedding source indentation in the message
            raise ValueError("Error: Using input parameters, you have filtered all contigs "
                             "from the HipMer assembly. Decrease the minimum contig size "
                             "and try again.")
        output_contigs = filtered_fasta_file_path

        self.log(console, 'Uploading FASTA file to Assembly')

        save_input = {'file': {'path': output_contigs},
                      'workspace_name': wsname,
                      'assembly_name': output_name
                      }

        output_data_ref = assemblyUtil.save_assembly_from_fasta(save_input)

        # create a Report
        # compute a simple contig length distribution for the report
        lengths = [len(seq_record.seq) for seq_record in SeqIO.parse(output_contigs, 'fasta')]

        report = ''
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/'
        report += params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        # edges has one more entry than counts: bin i spans edges[i]..edges[i + 1];
        # %d deliberately prints the bin edges as whole numbers
        for count, lower, upper in zip(counts, edges[:-1], edges[1:]):
            report += '   %d\t--\t%d to %d bp\n' % (count, lower, upper)

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                                 'label': params['output_contigset_name']}]})
        except Exception as e:
            # not really any way to test this, all inputs have been checked
            # earlier and should be ok
            print('Logging exception from running QUAST')
            print(str(e))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report(
                {'message': report,
                 'objects_created': [{'ref': output_data_ref,
                                      'description': 'Assembled contigs'}],
                 'direct_html_link_index': 0,
                 'html_links': [{'shock_id': quastret['shock_id'],
                                 'name': 'report.html',
                                 'label': 'QUAST report'}
                                ],
                 # NOTE(review): the 'kb_megahit_report_' prefix looks copied from
                 # another module; left unchanged because it is runtime output.
                 'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
                 'workspace_name': params['workspace_name']
                 })
        except Exception as e:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from creating report object')
            print(str(e))
            # TODO delete shock node
            raise

        # STEP 6: construct the output to send back
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref']
                  }
        return output
Пример #6
0
    def generate_report(self, console, warnings, fa_file_name, params, out_dir,
                        wsname):
        """
        Generate and save the Unicycler report.

        Builds a text summary and a per-contig table, runs QUAST on the
        assembly, renders the HTML template and creates an extended report.

        :param console: list of log lines; it is appended to through
            ``self.log``, scanned for the replicon-rotation section and
            embedded in the HTML report.
        :param warnings: passed through as the report's 'warnings'.
        :param fa_file_name: assembly FASTA file name inside ``out_dir``; the
            file is deleted after QUAST has run.
        :param params: method parameters; 'output_contigset_name' and
            'workspace_name' are read from it.
        :param out_dir: directory holding the output files and the reports.
        :param wsname: workspace name used for the assembly reference and the
            report text.
        :returns: tuple ``(report_name, report_ref)``.
        """
        self.log(console, 'Generating and saving report')

        fa_file_with_path = os.path.join(out_dir, fa_file_name)
        [length_stats, coverage_stats,
         circ_stats] = self.load_stats(console, fa_file_with_path)
        lengths = [length_stats[contig_id] for contig_id in length_stats]

        assembly_ref = wsname + '/' + params['output_contigset_name']

        # An empty assembly must not abort the report with ZeroDivisionError.
        avg_length = sum(lengths) / float(len(lengths)) if lengths else 0.0

        report_text = ''
        report_text += 'Unicycler results saved to: ' + wsname + '/' + out_dir + '\n'
        report_text += 'Assembly saved to: ' + assembly_ref + '\n'
        report_text += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report_text += 'Avg Length: ' + str(avg_length) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report_text += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        # edges has one more entry than counts: bin i spans edges[i]..edges[i + 1]
        for count, lower, upper in zip(counts, edges[:-1], edges[1:]):
            report_text += ('   ' + str(count) + '\t--\t' + str(lower) +
                            ' to ' + str(upper) + ' bp\n')

        self.log(console, 'Running QUAST')
        kbq = kb_quast(self.callbackURL)
        quastret = kbq.run_QUAST({
            'files': [{
                'path': fa_file_with_path,
                'label': params['output_contigset_name']
            }]
        })

        # delete assembly file to keep it out of zip
        os.remove(fa_file_with_path)

        # check starting genes: between the 'Rotating completed replicons' line
        # and the 'Assembly complete' line, the first field of a log line names
        # a contig and the fourth field is recorded for contigs marked 'Y'
        log_lines = iter(console)
        for line in log_lines:
            if not line.startswith('Rotating completed replicons'):
                continue
            while not line.startswith('Assembly complete'):
                # A truncated log (no 'Assembly complete' line) must end the
                # scan instead of raising StopIteration.
                line = next(log_lines, None)
                if line is None:
                    break
                fields = line.strip().split()
                if (len(fields) > 3 and fields[0] in circ_stats
                        and circ_stats[fields[0]] == 'Y'):
                    if fields[3] == 'none':
                        fields[3] = 'none found'
                    circ_stats[fields[0]] = 'Y, ' + fields[3]

        # check circularization and make data table for report
        contig_data = [{
            'contig_id': contig_id,
            'circular': circ_stats[contig_id],
            'coverage': coverage_stats[contig_id],
            'length': length_stats[contig_id]
        } for contig_id in length_stats]

        # move quast output into main out_dir
        move(os.path.join(quastret['quast_path'], 'report.html'),
             os.path.join(out_dir, 'quast_report.html'))

        output_files = self.generate_output_file_list(console, out_dir)

        # render template
        template_file = 'unicycler_tabs.tt'
        tmpl_data = {
            'page_title': 'Unicycler Report',
            'data_array': contig_data,
            'cols': [
                {'data': 'contig_id', 'title': 'Contig ID'},
                {'data': 'circular', 'title': 'Circular, Starting Gene'},
                {'data': 'coverage', 'title': 'Coverage (x)'},
                {'data': 'length', 'title': 'Length (bp)'},
            ],
            # the QUAST page moved above is embedded by relative path
            'quast_output': '<iframe style="display:block; width:100%; height:100vh; border:none;" src="quast_report.html"></iframe>',
        }
        # tmpl_vars is a JSON snapshot of the entries set so far, so it must be
        # computed before template_content and unicycler_log are added
        tmpl_data['tmpl_vars'] = json.dumps(tmpl_data,
                                            sort_keys=True,
                                            indent=2)
        tmpl_data['template_content'] = self.read_template(template_file)
        # drop 'tput' lines and lines starting with '0 / ' from the embedded log
        visible_log = [
            entry for entry in console
            if not (entry.startswith('tput') or entry.lstrip().startswith('0 / '))
        ]
        tmpl_data['unicycler_log'] = '<p><pre>' + '<br>'.join(visible_log) + '</pre></p>'

        # save report
        self.log(console, 'Saving report')
        report_file = 'unicycler_report.html'

        # copy the templates into 'scratch', where they can be accessed by KBaseReport
        try:
            copytree(os.path.join(self.appdir, 'templates'),
                     os.path.join(self.scratch, 'templates'))
        except Exception as e:
            # best effort: a failed copy is logged and the report is still attempted
            self.log(console, 'Exception copying tree. ' + str(e))

        reportClient = KBaseReport(self.callbackURL)
        # called for its side effect: the rendered page is written to output_file
        reportClient.render_template({
            'template_file': os.path.join(self.scratch, 'templates', template_file),
            'template_data_json': json.dumps(tmpl_data),
            'output_file': os.path.join(out_dir, report_file)
        })

        report_output = reportClient.create_extended_report({
            'message': report_text,
            'objects_created': [{
                'ref': assembly_ref,
                'description': 'Assembled contigs'
            }],
            'direct_html_link_index': 0,
            'file_links': output_files,
            'html_links': [{
                'path': out_dir,
                'name': report_file,
                'label': 'Unicycler report',
                'description': 'description of template report'
            }],
            'warnings': warnings,
            'report_object_name': 'kb_unicycler_report_' + str(uuid.uuid4()),
            'workspace_name': params['workspace_name']
        })

        return report_output['name'], report_output['ref']