def build_feature_set(self, params):
        self.validate_params(params, {'output_feature_set', 'workspace_name', },
                             {'genome', 'feature_ids', 'feature_ids_custom', 'base_feature_sets',
                              'description'})
        feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError("You must supply at least one feature source: {}".format(
                ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': new_feature_set,
                         'name': params['output_feature_set']}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
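        # object_info tuple returned by save_objects: [0] object id, [4] version, [6] workspace id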
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        objects_created = [{'ref': feature_set_obj_ref,
                            'description': 'Feature Set'}]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'feature_set_ref': feature_set_obj_ref,
                'report_name': output['name'], 'report_ref': output['ref']}
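
    # A hedged usage sketch for the method above (the `fs_utils` instance and all
    # parameter values are illustrative assumptions, not taken from this module):
    #
    #   result = fs_utils.build_feature_set({
    #       'workspace_name': 'my_workspace',
    #       'output_feature_set': 'my_feature_set',
    #       'feature_ids': ['gene_1', 'gene_2'],
    #       'description': 'genes of interest',
    #   })
    #   print(result['feature_set_ref'], result['report_name'], result['report_ref'])
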
    def filter_matrix_with_fs(self, params):
        self.validate_params(params, ('feature_set_ref', 'workspace_name',
                                      'expression_matrix_ref', 'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]}
        )['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids, params['workspace_name'],
            params['filtered_expression_matrix_suffix'])

        objects_created = [{'ref': filtered_matrix_ref,
                            'description': 'Filtered ExpressionMatrix Object'}]
        message = "Filtered Expression Matrix based of the {} feature ids present in {}"\
            .format(len(feature_ids), feature_set_name)

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'filtered_expression_matrix_ref': filtered_matrix_ref,
                'report_name': output['name'], 'report_ref': output['ref']}

    def _generate_report(self, expression_matrix_ref, workspace_name):
        """
        _generate_report: generate report
        """

        objects_created = [{'ref': expression_matrix_ref,
                            'description': 'Average ExpressionMatrix'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         # 'html_links': output_html_files,
                         # 'direct_html_link_index': 0,
                         'html_window_height': 366,
                         'report_object_name': 'kb_ave_expr_matrix_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                       down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{'ref': up_feature_set_ref,
                                 'description': 'Upper FeatureSet Object'}]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{'ref': down_feature_set_ref,
                                 'description': 'Lower FeatureSet Object'}]

        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{'ref': filtered_expression_matrix_ref,
                                 'description': 'Filtered ExpressionMatrix Object'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
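
# Minimal sketch of the report pattern shared by the snippets above, assuming a
# KBase SDK environment; the helper name and the import path for the report
# client are assumptions (the path varies between SDK versions):

import uuid

from installed_clients.KBaseReportClient import KBaseReport


def make_simple_report(callback_url, workspace_name, objects_created, message=''):
    """Create an extended report and return the name/ref pair used by the apps above."""
    report_params = {
        'message': message,
        'workspace_name': workspace_name,
        'objects_created': objects_created,
        'report_object_name': 'example_report_' + str(uuid.uuid4()),
    }
    output = KBaseReport(callback_url).create_extended_report(report_params)
    return {'report_name': output['name'], 'report_ref': output['ref']}
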
Example #5
    def run_CGView(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_CGView
        print('Starting run_kellyhuangCGView function. Params=')
        print(params)
        # Validating workspace_name and input_file is present
        print('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError(
                'Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'input_file' not in params:
            raise ValueError(
                'Parameter input_file is not set in input arguments')

        input_file = params['input_file']

        # Set up CCT project_folder
        subprocess.call(
            "cd /opt/cgview_comparison_tool && ./update_cogs.sh && cgview_comparison_tool.pl -p project",
            shell=True)

        # Turn genome object to Genbank file
        gfu = GenomeFileUtil(self.callback_url)
        gbk = gfu.genome_to_genbank({'genome_ref': input_file})
        gbk_file = gbk["genbank_file"]["file_path"]
        subprocess.call([
            "cp", gbk_file,
            "/opt/cgview_comparison_tool/project/reference_genome"
        ])
        base = ntpath.basename(gbk_file).rsplit(".", 1)[0]
        name_gbff = base + ".gbff"
        name_gbk = base + ".gbk"
        from_path = "/opt/cgview_comparison_tool/project/reference_genome/" + name_gbff
        print("===== from", from_path)
        to_path = "/opt/cgview_comparison_tool/project/reference_genome/" + name_gbk
        print("===== to", to_path)
        subprocess.call(["mv", from_path, to_path])

        # Add Genbank file to project_folder/reference_genome
        # Generate map from Genbank file
        # subprocess.call("cgview_comparison_tool.pl -p project", shell=True)
        os.chdir("/opt/cgview_comparison_tool")
        proc = subprocess.Popen([
            "cgview_comparison_tool.pl", "-p",
            "/opt/cgview_comparison_tool/project"
        ],
                                stdout=subprocess.PIPE)
        # for line in proc.stdout:
        #     print(line)
        proc.wait()
        subprocess.call(["cgview_comparison_tool.pl", "-p", " project"],
                        shell=True)

        # Retrieve map PNG from project_folder/maps
        subprocess.call([
            "cp", "/opt/cgview_comparison_tool/project/maps/medium.png",
            self.shared_folder
        ])
        subprocess.call([
            "cp", "/opt/cgview_comparison_tool/project/maps/medium.html",
            self.shared_folder
        ])

        # Resize image
        basewidth = 900
        img = Image.open('/opt/cgview_comparison_tool/project/maps/medium.png')
        wpercent = (basewidth / float(img.size[0]))
        hsize = int((float(img.size[1]) * float(wpercent)))
        img = img.resize((basewidth, hsize), Image.ANTIALIAS)
        # img = img.resize((600, 600), Image.ANTIALIAS)
        img.save('/opt/cgview_comparison_tool/project/maps/medium1.png',
                 quality=95)
        # print("=====", os.listdir("/opt/cgview_comparison_tool/project/maps/"))
        subprocess.call([
            "cp", "/opt/cgview_comparison_tool/project/maps/medium1.png",
            self.shared_folder
        ])

        png_dir = os.path.join(self.shared_folder, 'medium1.png')
        png_dir_higher = os.path.join(self.shared_folder, 'medium.png')
        html_dir = os.path.join(self.shared_folder, 'medium.html')
        png_dict = {'path': png_dir_higher, 'name': 'Circular_Genome_Map_PNG'}
        html_dict = {'path': png_dir, 'name': 'Circular Genome Map'}
        report_client = KBaseReport(self.callback_url)
        report = report_client.create_extended_report({
            'direct_html_link_index':
            0,
            'html_links': [html_dict],
            'file_links': [png_dict],
            'workspace_name':
            params['workspace_name'],
            'summary_window_height':
            900,
            'html_window_height':
            900
        })
        # subprocess.check_output(["cd", "/opt/cgview_comparison_tool"], shell=True)
        # proj_output = subprocess.check_output(["pwd"], shell=True)
        # print("=====cd /opt/cgview_comparison_tool=====", proj_output)
        #
        # report = KBaseReport(self.callback_url)
        # report_info = report.create({'report': {'objects_created':[],
        #                                         'text_message': params['input_file']},
        #                                         'workspace_name': params['workspace_name']})
        output = {
            'report_name': report['name'],
            'report_ref': report['ref'],
        }
        #END run_CGView

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_CGView return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
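
# The resize step above scales the height by the same factor as the width so the
# aspect ratio is preserved. A standalone sketch of that logic (Pillow assumed
# installed; file names are illustrative):

from PIL import Image


def resize_to_width(src_path, dst_path, basewidth=900):
    """Resize an image to `basewidth` pixels wide, keeping the aspect ratio."""
    img = Image.open(src_path)
    wpercent = basewidth / float(img.size[0])      # width scale factor
    hsize = int(float(img.size[1]) * wpercent)     # same factor applied to the height
    # Image.LANCZOS is the current name for the ANTIALIAS filter used above
    img = img.resize((basewidth, hsize), Image.LANCZOS)
    img.save(dst_path, quality=95)
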
Example #6
    def run_classify(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_classify

        logging.info(params)

        params = Params(params)
        Var.params = params
        '''
        tmp/                                        `shared_folder`
        └── kb_rdp_clsf_<uuid>/                      `run_dir`
            ├── return/                             `return_dir`
            |   ├── cmd.txt
            |   ├── study_seqs.fna
            |   └── RDP_Classifier_output/          `out_dir`
            |       ├── out_allRank.tsv
            |       └── out_fixedRank.tsv
            └── report/                             `report_dir`
                ├── pie_hist.html
                ├── suburst.html
                └── report.html
        '''

        ##
        ## set up the global data structure `Var` for this API-method run,
        ## which involves creating this run's directory structure

        Var.update({
            'run_dir':
            os.path.join(self.shared_folder,
                         'kb_rdp_clsf_' + str(uuid.uuid4())),
            'dfu':
            DataFileUtil(self.callback_url),
            'ws':
            Workspace(self.workspace_url),
            'gapi':
            GenericsAPI(self.callback_url),
            'kbr':
            KBaseReport(self.callback_url),
            'warnings': [],
        })

        os.mkdir(Var.run_dir)

        Var.update({
            'return_dir': os.path.join(Var.run_dir, 'return'),
            'report_dir': os.path.join(Var.run_dir, 'report'),
        })

        os.mkdir(Var.return_dir)
        os.mkdir(Var.report_dir)

        Var.update(
            {'out_dir': os.path.join(Var.return_dir, 'RDP_Classifier_output')})

        os.mkdir(Var.out_dir)

        # cat and gunzip SILVA refdata
        # which has been split into ~99MB chunks to get onto Github
        #if params.is_custom():
        #    app_file.prep_refdata()

        #
        ##
        ### load objects
        ####
        #####

        amp_mat = AmpliconMatrix(params['amp_mat_upa'])
        row_attr_map_upa = amp_mat.obj.get('row_attributemapping_ref')

        create_row_attr_map = row_attr_map_upa is None
        row_attr_map = AttributeMapping(row_attr_map_upa, amp_mat=amp_mat)

        #
        ##
        ### cmd
        ####
        #####

        fasta_flpth = os.path.join(Var.return_dir, 'study_seqs.fna')
        Var.out_allRank_flpth = os.path.join(Var.out_dir, 'out_allRank.tsv')
        Var.out_shortSeq_flpth = os.path.join(
            Var.out_dir,
            'out_unclassifiedShortSeqs.txt')  # seqs too short to classify

        shutil.copyfile(amp_mat.get_fasta(), fasta_flpth)

        cmd = ('java -Xmx4g -jar %s classify %s ' %
               (Var.classifier_jar_flpth, fasta_flpth) +
               ' '.join(params.cli_args) + ' ' + '--format allRank ' +
               '--outputFile %s --shortseq_outfile %s' %
               (Var.out_allRank_flpth, Var.out_shortSeq_flpth))

        run_check(cmd)

        #
        ##
        ### extract classifications
        ####
        #####

        id2taxStr = app_file.get_fix_filtered_id2tax()

        # get ids of classified and unclassified seqs
        shortSeq_id_l = app_file.parse_shortSeq()  # sequences too short to classify
        classified_id_l = list(id2taxStr.keys())

        # make sure classifieds and shorts complement
        if Var.debug:
            ret = sorted(classified_id_l + shortSeq_id_l)
            mat = sorted(amp_mat.obj['data']['row_ids'])
            assert ret == mat, \
                'diff1: %s, diff2: %s' % (set(ret)-set(mat), set(mat)-set(ret))

        if len(classified_id_l) == 0:
            raise Exception('No sequences were long enough to be classified')

        # add in id->'' for unclassified seqs
        # so id2taxStr_l is complete
        # so no KeyErrors later
        for shortSeq_id in shortSeq_id_l:
            id2taxStr[shortSeq_id] = ''

        # add to globals for testing
        Var.shortSeq_id_l = shortSeq_id_l

        #
        ##
        ### add to row AttributeMapping
        ####
        #####

        prose_args = params.get_prose_args()

        attribute = ('RDP Classifier Taxonomy (conf=%s, gene=%s)' %
                     (prose_args['conf'], prose_args['gene']))
        attribute_names = row_attr_map.get_attribute_names()
        if attribute in attribute_names:
            attribute = get_numbered_duplicate(attribute_names, attribute)

        source = 'RDP Classifier'

        ind, attribute = row_attr_map.add_attribute_slot(attribute, source)
        row_attr_map.update_attribute(ind, id2taxStr)

        #
        ##
        ### save obj
        ####
        #####

        amp_mat_output_name = Var.params['output_name']
        attr_map_output_name = (amp_mat_output_name + '.Amplicon_attributes'
                                if create_row_attr_map else None)

        row_attr_map_upa_new = row_attr_map.save(name=attr_map_output_name)

        amp_mat.obj['row_attributemapping_ref'] = row_attr_map_upa_new
        amp_mat_upa_new = amp_mat.save(amp_mat_output_name)

        objects_created = [
            dict(  # row AttrMap
                ref=row_attr_map_upa_new,
                description='%sAdded attribute `%s`' % (
                    'Created. ' if create_row_attr_map else '',
                    attribute,
                )),
            dict(  # AmpMat
                ref=amp_mat_upa_new,
                description=
                'Updated amplicon AttributeMapping reference to `%s`' %
                row_attr_map_upa_new),
        ]

        # testing
        if Var.debug:
            Var.update(dict(
                amp_mat=amp_mat,
                row_attr_map=row_attr_map,
            ))

        #
        ##
        ### html report
        ####
        #####

        hrw = report.HTMLReportWriter(cmd_l=[cmd])

        html_flpth = hrw.write()

        html_links = [{
            'path': Var.report_dir,
            'name': os.path.basename(html_flpth),
        }]

        #
        ##
        ###
        ####
        #####

        file_links = [{
            'path': Var.run_dir,
            'name': 'RDP_Classifier_results.zip',
            'description': 'Input, output'
        }]

        params_report = {
            'warnings': Var.warnings,
            'objects_created': objects_created,
            'html_links': html_links,
            'direct_html_link_index': 0,
            'file_links': file_links,
            'workspace_id': params['workspace_id'],
            'html_window_height': Var.report_height,
        }

        # testing
        Var.params_report = params_report

        report_obj = Var.kbr.create_extended_report(params_report)

        output = {
            'report_name': report_obj['name'],
            'report_ref': report_obj['ref'],
        }

        #END run_classify

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_classify return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
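
# The debug assertion in run_classify checks that the classified ids plus the
# too-short ids exactly cover the AmpliconMatrix row ids. A standalone sketch of
# that check (the function name is illustrative):


def check_id_partition(classified_ids, short_ids, all_row_ids):
    """Raise if classified and too-short sequence ids do not partition the row ids."""
    got = set(classified_ids) | set(short_ids)
    expected = set(all_row_ids)
    if got != expected:
        raise AssertionError('missing: %s, unexpected: %s'
                             % (expected - got, got - expected))
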
Example #7
    def run_picrust2_pipeline(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_picrust2_pipeline
        ####################################################################################################
        ####################################################################################################
        ####################################################################################################
        ####################################################################################################
        ####################################################################################################

        logging.info(params)

        #
        ##
        ### params, app-globals, directories, etc
        ####
        #####

        logging.info('BEGINNING KB_PICRUST2. params: %s' % str(params))

        params = Params(params)

        dprint('params', run=locals())

        reset_Var()  # clear all fields but `debug`

        Var.update(
            params=params,
            dfu=DataFileUtil(self.callback_url),
            kbr=KBaseReport(self.callback_url),
            fpu=FunctionalProfileUtil(self.callback_url, service_ver='beta'),
            gapi=GenericsAPI(self.callback_url),
            shared_folder=self.shared_folder,
            run_dir=os.path.join(self.shared_folder,
                                 'run_dir_picrust2_' + str(uuid.uuid4())),
            warnings=[],
            objects_created=[],
        )

        os.mkdir(Var.run_dir)  # for this API-method run

        Var.update(return_dir=os.path.join(Var.run_dir, 'return'), )

        os.mkdir(Var.return_dir)  # for return input/output/logs etc.

        if Var.debug:
            with open(os.path.join(Var.run_dir, '#params'), 'w') as fh:
                json.dump(params.params, fh)

        # TODO document `run_dir` structure

        #
        ##
        ### obj
        ####
        #####

        # instantiate

        amp_mat = AmpliconMatrix(params['amplicon_matrix_upa'])
        if 'row_attributemapping_ref' in amp_mat.obj:
            row_attrmap = AttributeMapping(
                amp_mat.obj['row_attributemapping_ref'], amp_mat)
        else:
            msg = (
                "Input AmpliconMatrix "
                "does not have a row AttributeMapping to assign PICRUSt2 functions to."
            )
            logging.warning(msg)
            Var.warnings.append(msg)

        # validate input data

        amp_mat.validate_amplicon_abundance_data()

        # generate input files

        seq_flpth = os.path.join(Var.return_dir, 'study_seqs.fna')
        seq_abundance_table_flpth = os.path.join(Var.return_dir,
                                                 'study_seqs.tsv')

        amp_mat.to_fasta(seq_flpth)
        amp_mat.to_seq_abundance_table(seq_abundance_table_flpth)

        # objs should be app globals
        Var.amp_mat = amp_mat

        #
        ##
        ### args
        ####
        #####

        # TODO get tee functionality working in run_check
        # to avoid extra cmd

        Var.out_dir = os.path.join(Var.return_dir, 'PICRUSt2_output')
        log_flpth = os.path.join(Var.return_dir, 'log.txt')
        p = 4

        cmd_pipeline = ' '.join([
            'set -o pipefail &&',
            'source activate picrust2 &&',
            'picrust2_pipeline.py',
            '-s',
            seq_flpth,
            '-i',
            seq_abundance_table_flpth,
            '-o',
            Var.out_dir,
            '--per_sequence_contrib',
            '-p',
            str(p),
            '|& tee',
            log_flpth,
        ])

        cmd_description = ' \\\n'.join([
            'cd %s &&' % Var.out_dir, 'source activate picrust2 &&',
            'add_descriptions.py -i EC_metagenome_out/pred_metagenome_unstrat.tsv.gz -m EC',
            '                    -o EC_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz',
            '&&',
            'add_descriptions.py -i KO_metagenome_out/pred_metagenome_unstrat.tsv.gz -m KO',
            '                    -o KO_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz',
            '&&',
            'add_descriptions.py -i pathways_out/path_abun_unstrat.tsv.gz -m METACYC',
            '                    -o pathways_out/path_abun_unstrat_descrip.tsv.gz'
        ])

        get_cmd_func_l = lambda FUNC: [
            ('cd %s && ' % Var.out_dir + 'source activate picrust2 && '
             f'hsp.py -i {FUNC} -t out.tre -o {FUNC}_predicted.tsv.gz -p {p}'),
            ('cd %s && ' % Var.out_dir + 'source activate picrust2 && '
             'metagenome_pipeline.py '
             '-i ../%s ' % os.path.basename(seq_abundance_table_flpth) +
             '-m marker_predicted_and_nsti.tsv.gz '
             f'-f {FUNC}_predicted.tsv.gz '
             f'-o {FUNC}_metagenome_out')
        ] + ([] if FUNC == 'PHENO' else [  # no descriptions for IMG phenotype
            ('cd %s && ' % Var.out_dir + 'source activate picrust2 && '
             f'add_descriptions.py -i {FUNC}_metagenome_out/pred_metagenome_unstrat.tsv.gz -m {FUNC} '
             f'-o {FUNC}_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz'
             ),
        ])

        cmd_func_l = []
        for func in ['cog', 'pfam', 'tigrfam', 'pheno']:
            if params.getd(func) == 1:
                cmd_func_l.extend(get_cmd_func_l(func.upper()))

        #
        ##
        ### run
        ####
        #####

        run_check(cmd_pipeline)
        run_check(cmd_description)
        for cmd_func in cmd_func_l:
            run_check(cmd_func)

        #
        ##
        ### sanity checks
        ####
        #####

        if Var.debug:
            for func in Var.func_l:
                if not Var.params.getd(func):
                    continue

                fp0 = os.path.join(Var.out_dir,
                                   Var.func_2_cfg[func]['relfp'][0])
                fp1 = os.path.join(Var.out_dir,
                                   Var.func_2_cfg[func]['relfp'][1])

                # Check dropped amplicons are the unaligned/distant ones (debug)
                appfile.check_dropped_amplicon_ids(fp0, amp_mat)
                # Check no samples dropped (debug)
                appfile.check_dropped_sample_ids(fp1, amp_mat)

        #
        ##
        ### update/save Amplicon workflow objects
        ####
        #####

        path_abun_predictions_tsv_gz_flpth = os.path.join(
            Var.out_dir, 'pathways_out/path_abun_predictions.tsv.gz')

        attribute = 'MetaCyc Predictions'
        source = 'PICRUSt2'

        # if row AttributeMapping,
        # update that and referencing objs
        if amp_mat.row_attrmap_upa is not None:

            # update row AttributeMapping with traits
            id2attr = appfile.parse_picrust2_traits(
                path_abun_predictions_tsv_gz_flpth)
            ind, attribute = row_attrmap.add_attribute_slot(attribute, source)
            row_attrmap.map_update_attribute(ind, id2attr)
            row_attrmap_upa_new = row_attrmap.save()

            # update AmpliconMatrix which references row AttributeMapping
            amp_mat.obj['row_attributemapping_ref'] = row_attrmap_upa_new
            amp_mat_upa_new = amp_mat.save(name=params['output_name'])

            Var.objects_created.extend([
                {
                    'ref': row_attrmap_upa_new,
                    'description': 'Added attribute `%s`' % attribute,
                },
                {
                    'ref':
                    amp_mat_upa_new,
                    'description':
                    'Updated amplicon AttributeMapping reference to `%s`' %
                    row_attrmap_upa_new
                },
            ])

        #
        ##
        ### html report w/ heatmaps
        ####
        #####

        logging.info('Beginning report business')

        ##
        ## report

        Var.report_dir = os.path.join(Var.run_dir, 'report')

        report_html_flpth = report.HTMLReportWriter(
            [cmd_pipeline, cmd_description] + cmd_func_l, ).write()

        html_links = [{
            'path': Var.report_dir,
            'name': os.path.basename(report_html_flpth),
        }]

        #
        ##
        ### FunctionalProfile
        ####
        #####
        logging.info('Starting saving FunctionalProfiles if any')

        if Var.debug:
            FP_amp_mat_ref = params[
                'amplicon_matrix_upa']  # this makes mocking more flexible in case something makes a fake UPA
        else:
            FP_amp_mat_ref = amp_mat_upa_new  # this AmpliconMatrix is new one with new AttributeMapping

        # gunzip TSVs out to another directory
        tsv_dir = os.path.join(Var.run_dir, 'decompressed_tsv')
        os.mkdir(tsv_dir)

        for func in Var.func_l:
            if not Var.params.getd(func):
                continue

            func_name = Var.func_2_cfg[func]['name']

            if Var.params.getd('create_amplicon_fps'):
                id = 'amplicon_' + func
                desc = 'Amplicon %s abundance' % func_name

                fp_src = os.path.join(Var.out_dir,
                                      Var.func_2_cfg[func]['relfp'][0])
                fp_dst = os.path.join(tsv_dir, id + '.tsv')
                gunzip(fp_src, fp_dst)

                upa = Var.fpu.import_func_profile(
                    dict(
                        workspace_id=Var.params['workspace_id'],
                        func_profile_obj_name='%s.%s' %
                        (Var.params['output_name'], id),
                        original_matrix_ref=FP_amp_mat_ref,
                        profile_file_path=fp_dst,
                        profile_type='amplicon',
                        profile_category='organism',
                        data_epistemology='predicted',
                        epistemology_method='PICRUSt2',
                        description=desc,
                    ))['func_profile_ref']

                Var.objects_created.append(dict(ref=upa, description=desc))

            if Var.params.getd('create_sample_fps'):
                id = 'metagenome_' + func
                desc = 'Metagenome %s abundance' % func_name

                fp_src = os.path.join(Var.out_dir,
                                      Var.func_2_cfg[func]['relfp'][1])
                fp_dst = os.path.join(tsv_dir, id + '.tsv')
                gunzip(fp_src, fp_dst)

                upa = Var.fpu.import_func_profile(
                    dict(
                        workspace_id=Var.params['workspace_id'],
                        func_profile_obj_name='%s.%s' %
                        (Var.params['output_name'], id),
                        original_matrix_ref=FP_amp_mat_ref,
                        profile_file_path=fp_dst,
                        profile_type='mg',
                        profile_category='community',
                        data_epistemology='predicted',
                        epistemology_method='PICRUSt2',
                        description=desc,
                    ))['func_profile_ref']

                Var.objects_created.append(dict(ref=upa, description=desc))

        # look at TSVs
        dprint(
            'ls -lh %s/*' % tsv_dir,
            #'file -i %s/*/*' % tsv_dir,
            run='cli')

        #
        ##
        ### return files
        ####
        #####

        file_links = [{
            'path':
            Var.return_dir,
            'name':
            'PICRUSt2_results.zip',
            'description':
            'Input, output, cmd, intermediate files, log'
        }]

        params_report = {
            'warnings': Var.warnings,
            'objects_created': Var.objects_created,
            'file_links': file_links,
            'html_links': html_links,
            'direct_html_link_index': 0,
            'report_object_name': 'kb_PICRUSt2_report',
            'workspace_name': params['workspace_name'],
            'html_window_height': report.REPORT_HEIGHT,
        }

        Var.params_report = params_report

        obj = Var.kbr.create_extended_report(params_report)

        output = {
            'report_name': obj['name'],
            'report_ref': obj['ref'],
        }

        #END run_picrust2_pipeline

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_picrust2_pipeline return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
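
# `gunzip(fp_src, fp_dst)` above is a helper defined elsewhere in the module; a
# minimal standard-library stand-in might look like this (an assumption about
# its behaviour, not the module's actual implementation):

import gzip
import shutil


def gunzip(src_gz_path, dst_path):
    """Decompress a .gz file to dst_path, leaving the source file in place."""
    with gzip.open(src_gz_path, 'rb') as fin, open(dst_path, 'wb') as fout:
        shutil.copyfileobj(fin, fout)
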
Example #8
    def update_matrix_attribute_mapping(self, params):

        dimension = params.get('dimension')
        if dimension not in ['col', 'row']:
            raise ValueError('Please use "col" or "row" for input dimension')

        workspace_name = params.get('workspace_name')

        old_matrix_ref = params.get('input_matrix_ref')
        old_matrix_obj = self.dfu.get_objects(
            {'object_refs': [old_matrix_ref]})['data'][0]
        old_matrix_info = old_matrix_obj['info']
        old_matrix_data = old_matrix_obj['data']

        old_am_ref = old_matrix_data.get(
            '{}_attributemapping_ref'.format(dimension))

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        if not old_am_ref:
            raise ValueError(
                'Matrix object does not have {} attribute mapping'.format(
                    dimension))

        new_am_ref = self.append_file_to_attribute_mapping(
            params['staging_file_subdir_path'], old_am_ref, workspace_id,
            params['output_am_obj_name'])['attribute_mapping_ref']

        old_matrix_data['{}_attributemapping_ref'.format(
            dimension)] = new_am_ref

        info = self.dfu.save_objects({
            "id":
            workspace_id,
            "objects": [{
                "type": old_matrix_info[2],
                "data": old_matrix_data,
                "name": params['output_matrix_obj_name']
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_am_ref,
            'description': 'Updated Attribute Mapping'
        }, {
            'ref': new_matrix_obj_ref,
            'description': 'Updated Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'new_attribute_mapping_ref': new_am_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def find_motifs(self, ctx, params):
        """
        :param params: instance of type "find_motifs_params" (SS_ref -
           optional, used for exact genome locations if possible) ->
           structure: parameter "workspace_name" of String, parameter
           "fastapath" of String, parameter "motif_min_length" of Long,
           parameter "motif_max_length" of Long, parameter "SS_ref" of
           String, parameter "obj_name" of String, parameter "background" of
           Long
        :returns: instance of type "extract_output_params" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN find_motifs
        if 'motif_min_length' not in params:
            params['motif_min_length'] = 8
        if 'motif_max_length' not in params:
            params['motif_max_length'] = 16

        motMin = params['motif_min_length']
        motMax = params['motif_max_length']

        promoterFastaFilePath = self.SSU.SeqSetToFasta({
            'ws_name':
            params['workspace_name'],
            'SS_ref':
            params['SS_ref']
        })['path']

        MEMEMotifCommand = self.MEU.build_meme_command(promoterFastaFilePath,
                                                       motMin, motMax,
                                                       params['background'])
        meme_out_path = self.MEU.run_meme_command(MEMEMotifCommand)

        meme_params = {
            'ws_name': params['workspace_name'],
            'format': 'MEME',
            'file': {
                'path': meme_out_path
            },
            'obj_name': params['obj_name'],
            'seq_set_ref': params['SS_ref']
        }

        # MOU.parseMotifSet with the same parameters will
        # return a dictionary of the motifset object that you save on
        # your own
        #
        # MOU.saveMotifSet will save the object with DataFileUtils to
        # whatever workspace you specify in ws_name
        #
        # This function will also download the sequence set as a fasta to
        # unique (uuid4) file name in the scratch directory

        obj_ref = self.MOU.saveMotifSet(meme_params)

        timestamp = str(
            int((datetime.utcnow() -
                 datetime.utcfromtimestamp(0)).total_seconds() * 1000))
        htmlDir = self.shared_folder + '/html' + timestamp
        os.mkdir(htmlDir)

        get_obj_params = {'object_refs': [obj_ref]}
        memeMotifSet = self.dfu.get_objects(get_obj_params)['data'][0]['data']
        self.GR.MakeMotifReport(htmlDir, memeMotifSet)

        try:
            html_upload_ret = self.dfu.file_to_shock({
                'file_path': htmlDir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception:
            raise ValueError('Error uploading HTML file: ' + str(htmlDir) +
                             ' to shock')

        reportname = 'MEMEMotifFinder_report_' + str(uuid.uuid4())

        reportobj = {
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Motif Set generated by MEME'
            }],
            'message':
            '',
            'direct_html':
            None,
            'direct_html_link_index':
            0,
            'file_links': [],
            'html_links': [],
            'html_window_height':
            220,
            'workspace_name':
            params['workspace_name'],
            'report_object_name':
            reportname
        }

        # attach to report obj
        reportobj['direct_html'] = ''
        reportobj['direct_html_link_index'] = 0
        reportobj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            'name': 'index.html',
            'label': 'Save promoter_download.zip'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        report_info = report.create_extended_report(reportobj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        #END find_motifs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method find_motifs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
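
# The timestamp used for the HTML directory above is milliseconds since the Unix
# epoch; a shorter equivalent (the helper name is illustrative):

import time


def epoch_millis():
    """Milliseconds since the Unix epoch, as a string suitable for directory names."""
    return str(int(time.time() * 1000))
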
Example #10
class snp2gene:
    '''
    Module Name:
    snp2gene

    Module Description:
    A KBase module: snp2gene
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "[email protected]:kbasecollaborations/snp2gene.git"
    GIT_COMMIT_HASH = "8dd593e96c4b37fcf91a719181389e1b04c0bb4a"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.config['callback_url'] = os.environ['SDK_CALLBACK_URL']
        callback_url = self.config['callback_url']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.kbr = KBaseReport(callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def annotate_gwas_results(self, ctx, params):
        """
        annotate_gwas_results:
        inputs:
            file path to gwas results
            genome object - with reference to GFF file
        outputs:
            TSV file represented by shock/handle ids and
        :param params: instance of type "annotate_gwas_input" -> structure:
           parameter "gwas_result_file" of type "file_path" (A valid file
           path), parameter "genome_obj" of type "genome_ref" (KBase style
           object reference X/Y/Z @id ws KBaseGenomes.Genome)
        :returns: instance of type "annotate_gwas_output" -> structure:
           parameter "snp_to_gene_list" of type "file_path" (A valid file
           path)
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_gwas_results

        gene_list = GFFUtils(self.config).annotate_GWAS_results(
            params['genome_obj'], params['gwas_result_file'])

        output = {'snp_to_gene_list': gene_list}

        #END annotate_gwas_results

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_gwas_results return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def annotate_gwas_results_app(self, ctx, params):
        """
        :param params: instance of type "annotate_gwas_app_input" ->
           structure: parameter "associations" of list of type
           "association_ref" (KBase style object reference X/Y/Z @id ws
           KBaseGwasData.Associations), parameter "p_value" of String,
           parameter "prefix" of String
        :returns: instance of type "annotate_gwas_app_output" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "featureset_obj" of type "featureset_ref" (KBase
           style object reference X/Y/Z @id ws KBaseCollections.FeatureSet)
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_gwas_results_app
        # return the results
        print(params)
        #TODO: Handle cases where there are no significant SNPs
        #genome_ref = "47506/4/1"
        objects_created = []
        for association_ref in params['associations']:

            variation_ref = self.wsc.get_object_subset([{
                'included': ['/variation_id'],
                'ref':
                association_ref
            }])[0]['data']['variation_id']

            genome_ref = self.wsc.get_object_subset([{
                'included': ['/genome_ref'],
                'ref':
                variation_ref
            }])[0]['data']['genome_ref']

            featureset_obj = GFFUtils2(self.config).annotate_GWAS_results(
                genome_ref, association_ref, params['workspace_name'],
                params['prefix'], params['p_value'])
            objects_created.append({
                'ref': featureset_obj,
                'description': 'FeatureSet'
            })
        # Build the new gff before doing anything

        # Download the workspace object for association one at a time
        # Filter SNPs for p-value, if no snps shows up, append this to warnings
        # Build the table structure needed for snp2gene
        # Run snp2gene algorithm and get final list.txt
        # Save as featureset. Find how to save featureset from genelist

        report_info = self.kbr.create_extended_report({
            'message':
            ' ',
            'objects_created':
            objects_created,
            'report_object_name':
            'annotate_gwas_results_app_' + str(uuid.uuid4()),
            'workspace_name':
            params['workspace_name']
        })
        output = dict()
        output['report_name'] = report_info['name']
        output['report_ref'] = report_info['ref']
        print(output)

        #END annotate_gwas_results_app

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_gwas_results_app return value ' +
                             'output is not type dict as required.')
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]

    def link_reads(self, ctx, params):
        """
        :param params: instance of type "LinkReadsParams" -> structure:
           parameter "workspace_name" of String, parameter "workspace_id" of
           String, parameter "sample_set_ref" of String, parameter "links" of
           list of type "ReadsLink" (Create links between samples and reads
           objects.) -> structure: parameter "sample_name" of String,
           parameter "reads_ref" of String
        :returns: instance of type "LinkReadsOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String,
           parameter "links" of list of unspecified object
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN link_reads
        logging.info(params)

        ss = SampleService(self.sample_url)

        sample_set_ref = params['sample_set_ref']
        sample_set_obj = self.dfu.get_objects(
            {'object_refs': [sample_set_ref]})['data'][0]['data']
        sample_name_2_info = {d['name']: d for d in sample_set_obj['samples']}

        links = [(d['sample_name'][0], d['reads_ref'])
                 for d in params['links']]

        new_data_links = []
        for sample_name, reads_ref in links:
            sample_id = sample_name_2_info[sample_name]['id']
            version = sample_name_2_info[sample_name]['version']
            sample = ss.get_sample({
                'id': sample_id,
                'version': version,
            })
            ret = ss.create_data_link(
                dict(
                    upa=reads_ref,
                    id=sample_id,
                    version=version,
                    node=sample['node_tree'][0]['id'],
                    update=1,
                ))
            new_data_links.append(ret)

        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report({
            'workspace_name':
            params['workspace_name'],
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'links': new_data_links,
        }
        #END link_reads

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method link_reads return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
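
# The lookup in link_reads assumes every sample_name in params['links'] exists in
# the SampleSet; a guarded variant of that lookup might look like this (a sketch,
# not this module's code):


def lookup_sample_info(sample_name_2_info, sample_name):
    """Return the SampleSet entry for a sample name, failing with a clear message."""
    try:
        return sample_name_2_info[sample_name]
    except KeyError:
        raise ValueError('Sample "%s" was not found in the linked SampleSet'
                         % sample_name)
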
Example #12
    def run_FAPROTAX(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_FAPROTAX

        logging.info(params)

        Var.update({ # carry over into globals `Var`, regardless of resetting, for all API-method runs
            'params': Params(params),
            'shared_folder': self.shared_folder,
            'kbase_endpoint': self.config['kbase-endpoint'], # contains environment, for constructing Genome landing page url
            #---
            'ws': Workspace(self.workspace_url),
            'dfu': DataFileUtil(self.callback_url), # instantiate here so within runtime of @patch
            'kbr': KBaseReport(self.callback_url), # instantiate here so within runtime of @patch 
            'gapi': GenericsAPI(self.callback_url),
            'fpu': FunctionalProfileUtil(self.callback_url, service_ver='beta'), # TODO overhead?
            #---
            'warnings': [],
            #---
            'run_dir': os.path.join(self.shared_folder, 'kbfptx_' + str(uuid.uuid4())),
        })

        os.mkdir(Var.run_dir)
        Var.update({
            'return_dir': os.path.join(Var.run_dir, 'return'),
        })
        os.mkdir(Var.return_dir)

        #
        ##
        ### detect input type
        ####
        #####

        oi = Var.ws.get_object_info3(
            {'objects': [{
                'ref': params['input_upa']
            }]})['infos'][0]

        if oi[2].startswith('KBaseSearch.GenomeSet'):
            output = do_GenomeSet_workflow()

        elif oi[2].startswith('KBaseMatrices.AmpliconMatrix'):
            output = do_AmpliconMatrix_workflow()

        else:
            raise Exception('Unknown type `%s` for `input_upa`' % oi[2])

        #END run_FAPROTAX

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_FAPROTAX return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
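
    # Hedged sketch of the type dispatch above: get_object_info3 returns an
    # object_info tuple whose third element is the type string, so the same
    # branching can be written with a prefix-to-workflow table (illustrative):
    #
    #   workflows = {
    #       'KBaseSearch.GenomeSet': do_GenomeSet_workflow,
    #       'KBaseMatrices.AmpliconMatrix': do_AmpliconMatrix_workflow,
    #   }
    #   for prefix, workflow in workflows.items():
    #       if oi[2].startswith(prefix):
    #           output = workflow()
    #           break
    #   else:
    #       raise Exception('Unknown type `%s` for `input_upa`' % oi[2])
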
    def find_motifs(self, ctx, params):
        """
        :param params: instance of type "find_motifs_params" -> structure:
           parameter "workspace_name" of String, parameter "fastapath" of
           String, parameter "motif_min_length" of Long, parameter
           "motif_max_length" of Long, parameter "SS_ref" of String,
           parameter "obj_name" of String
        :returns: instance of type "extract_output_params" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN find_motifs
        if 'motif_length' not in params:
            params['motif_length'] = 8
        motLen = params['motif_length']

        promoterFastaFilePath = params['fastapath']

        MDU = MdscanUtil()
        MdscanMotifCommand = MDU.build_mdscan_motif_command(
            promoterFastaFilePath, motLen, params['background'])
        MDU.run_mdscan_command(MdscanMotifCommand)
        mdscan_out_path = '/kb/module/work/tmp/mdscan_out'
        mdscan_params = {
            'ws_name': params['workspace_name'],
            'path': mdscan_out_path,
            'obj_name': params['obj_name']
        }
        MOU = MotifUtils(self.callback_url)
        dfu = DataFileUtil(self.callback_url)
        locDict = {}

        obj_ref = MDU.UploadFromMdscan(self.callback_url, mdscan_params)[0]['obj_ref']
        MDU.write_obj_ref(mdscan_out_path, obj_ref)
        
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000)
        timestamp = str(timestamp)
        htmlDir = self.shared_folder + '/html' +  timestamp
        os.mkdir(htmlDir)
        lineCount = 0
        with open(promoterFastaFilePath,'r') as pFile:
            for line in pFile:
                lineCount += 1
        numFeat = lineCount/2
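        # Note: lineCount/2 assumes strictly two-line FASTA records and yields a
        # float on Python 3. A record count based on header lines (sketch; the
        # helper name is illustrative) would be:
        #
        #   def count_fasta_records(fasta_path):
        #       with open(fasta_path) as fh:
        #           return sum(1 for line in fh if line.startswith('>'))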
        with open(promoterFastaFilePath,'r') as pFile:
            fileStr = pFile.read()
        promHtmlStr = '<html><body> '  + fileStr + ' </body></html>'
        with open(htmlDir + '/promoters.html','w') as promHTML:
            promHTML.write(promHtmlStr)
        JsonPath = '/kb/module/work/tmp'

        dfu = DataFileUtil(self.callback_url)
        get_obj_params = {'object_refs' : [obj_ref]}
        mdscanMotifSet = dfu.get_objects(get_obj_params)['data'][0]['data']
        mr=MakeNewReport()
        mr.MakeReport(htmlDir,mdscanMotifSet)


        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': htmlDir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception:
            raise ValueError('Error uploading HTML file to shock')


        reportName = 'MdscanMotifFinder_report_'+str(uuid.uuid4())

        reportObj = {'objects_created': [{'ref' : obj_ref, 'description' : 'Motif Set generated by Mdscan'}],
                     'message': '',
                     'direct_html': None,
                     'direct_html_link_index': 0,
                     'file_links': [],
                     'html_links': [],
                     'html_window_height': 220,
                     'workspace_name': params['workspace_name'],
                     'report_object_name': reportName
                     }


        # attach to report obj
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                    #'name': 'promoter_download.zip',
                                    'name': 'index.html',
                                    'label': 'Save promoter_download.zip'
                                    }
                                   ]


        report = KBaseReport(self.callback_url, token=ctx['token'])
        report_info = report.create_extended_report(reportObj)
        output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] }

        #END find_motifs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method find_motifs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def find_motifs(self, ctx, params):
        """
        :param params: instance of type "find_motifs_params" (SS_ref -
           optional, used for exact genome locations if possible) ->
           structure: parameter "workspace_name" of String, parameter
           "fastapath" of String, parameter "motif_min_length" of Long,
           parameter "motif_max_length" of Long, parameter "SS_ref" of String
        :returns: instance of type "extract_output_params" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN find_motifs
        if 'motif_min_length' not in params:
            params['motif_min_length'] = 8
        if 'motif_max_length' not in params:
            params['motif_max_length'] = 16
        motMin = params['motif_min_length']
        motMax = params['motif_max_length']

        #promoterFastaFilePath = self.get_promoter_for_gene(ctx,params)[0]
        promoterFastaFilePath = params['fastapath']

        #GibbsMotifCommand = GU.build_gibbs_command(promoterFastaFilePath)
        gibbsCommandList = []
        for i in range(motMin, motMax + 1, 2):
            gibbsCommandList.append(
                GU.build_gibbs_command(promoterFastaFilePath, i))

        for g in gibbsCommandList:
            GU.run_gibbs_command(g)
        #GU.run_gibbs_command(GibbsMotifCommand)
        gibbs_out_path = '/kb/module/work/tmp/gibbs'
        gibbs_params = {
            'ws_name': params['workspace_name'],
            'path': gibbs_out_path,
            'obj_name': params['obj_name']
        }
        MOU = MotifUtils(self.callback_url)
        dfu = DataFileUtil(self.callback_url)
        locDict = {}
        if 'SS_ref' in params:
            get_ss_params = {'object_refs': [params['SS_ref']]}
            SS = dfu.get_objects(get_ss_params)['data'][0]['data']
            for s in SS['sequences']:
                if s['source'] is not None:
                    locDict[s['sequence_id']] = {
                        'contig': s['source']['location'][0][0],
                        'start': str(s['source']['location'][0][1])
                    }
        if len(locDict.keys()) > 0:
            gibbs_params['absolute_locations'] = locDict
        gibbs_params['min_len'] = motMin
        gibbs_params['max_len'] = motMax
        obj_ref = MOU.UploadFromGibbs(gibbs_params)['obj_ref']
        #memeMotifList = MEU.parse_meme_output()

        #HERE:
        #we've got object ref
        #we've got html building functions
        #build report, setup return,
        #make report and return it

        #buildReportFromMotifSet()

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        timestamp = str(timestamp)
        htmlDir = self.shared_folder + '/html' + timestamp
        os.mkdir(htmlDir)
        lineCount = 0
        with open(promoterFastaFilePath, 'r') as pFile:
            for line in pFile:
                lineCount += 1
        numFeat = lineCount // 2  # one header line + one sequence line per FASTA record
        with open(promoterFastaFilePath, 'r') as pFile:
            fileStr = pFile.read()
        promHtmlStr = '<html><body> ' + fileStr + ' </body></html>'
        with open(htmlDir + '/promoters.html', 'w') as promHTML:
            promHTML.write(promHtmlStr)
        JsonPath = '/kb/module/work/tmp'

        dfu = DataFileUtil(self.callback_url)
        get_obj_params = {'object_refs': [obj_ref]}
        gibbsMotifSet = dfu.get_objects(get_obj_params)['data'][0]['data']
        MakeReport(htmlDir, gibbsMotifSet)
        #buildReportFromMotifSet(gibbsMotifSet,htmlDir,'gibbs')

        #TODO: Here replace the makereport with a call to motifset utils
        #subprocess.call(['python','/kb/module/lib/identify_promoter/Utils/makeReport.py',JsonPath + '/meme_out/meme.json',htmlDir + '/meme.html',str(numFeat)])
        #fullMotifList = []
        #for m in memeMotifList:
        #    fullMotifList.append(m)

        #What needs to happen here:
        #call makeLogo for each of the json outputs(capture these from somewhere)

        #plt.rcParams['figure.dpi'] = 300

        #htmlFiles = ['index.html','gibbs.html','homer.html']
        #shockParamsList = []
        #for f in htmlFiles:
        #    shockParamsList.append({'file_path': htmlDir + f ,'make_handle': 0, 'pack': 'zip'})

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': htmlDir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception as err:
            raise ValueError('error uploading HTML file to shock: ' + str(err))

        #Create motif set object from MotifList
        #TODO set parameters correctly
        #add narrative support to set
        #MSO = {}
        #MSO['Condition'] = 'Temp'
        #MSO['FeatureSet_ref'] = '123'
        #MSO['Motifs'] = []
        #MSO['Alphabet'] = ['A','C','G','T']
        #MSO['Background'] = {}
        #for letter in MSO['Alphabet']:
        #    MSO['Background'][letter] = 0.0

        #MSU.parseMotifList(fullMotifList,MSO)
        #objname = 'MotifSet' + str(int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000))

        #Pass motif set into this
        #save_objects_params = {}
        #save_objects_params['id'] = self.ws_info[0]
        #save_objects_params['id'] = long(params['workspace_name'].split('_')[1])
        #save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        #save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet' , 'data' : MSO , 'name' : objname}]

        #info = dfu.save_objects(save_objects_params)[0]
        #motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        #object_upload_ret = dfu.file_to_shock()

        reportName = 'GibbsMotifFinder_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Motif Set generated by Gibbs'
            }],
            'message': '',
            'direct_html': None,
            'direct_html_link_index': 0,
            'file_links': [],
            'html_links': [],
            'html_window_height': 220,
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }

        # attach to report obj
        #reportObj['direct_html'] = None
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            'name': 'index.html',
            'label': 'Save promoter_download.zip'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']})
        report_info = report.create_extended_report(reportObj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END find_motifs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method find_motifs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #15
    def ad_vina(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN ad_vina

        VarStash.update({'ctx': ctx, 'workspace_id': params['workspace_id']})

        ctx_censored = ctx.copy()
        ctx_censored.pop('token')

        dprint('params', 'ctx_censored', run=locals())

        ##
        ####
        ###### param validation
        ###### and defaulting
        ####
        ##

        params_search_space = params['search_space']

        ##
        ## if center specified, must be completely specified

        key_center_l = ['center_' + ch for ch in list('xyz')]
        center_xyz = [params_search_space[key] for key in key_center_l]

        if any(center_xyz) and not all(center_xyz):
            raise ValueError(
                'INPUT ERROR: '
                'If any of center (i.e., center_x, center_y, center_z) is specified, all of center must be specified. '
                'Please try again')
        """
        ##
        ## must specify center to specify any of size

        key_size_l = ['size_' + ch for ch in list('xyz')]
        size_xyz = [params_search_space[key] for key in key_size_l]

        if not all(center_xyz) and any(size_xyz):
            raise ValueError(   
                "INPUT ERROR: "
                "Must completely specify center (i.e., center_x, center_y, center_z) before specifying any of size (i.e., size_x, size_y, size_z). "
                "(Also, if any of size is unspecified, it will default to 30 Angstroms.) "
                "Please try again"
                )
        """

        ##
        ## if center specified, fill in default size

        size_default = 30  # Angstroms

        key_size_l = ['size_' + ch for ch in list('xyz')]
        size_xyz = [params_search_space[key] for key in key_size_l]

        if all(center_xyz) and not all(size_xyz):
            for key_size in key_size_l:
                if not params_search_space.get(key_size):
                    params_search_space[key_size] = size_default

        ##
        ####
        ###### dl
        ####
        ##

        ps = ProteinStructure(params['pdb_ref'])
        ps.calc_center_size()
        ps.convert_to_pdbqt()

        cs = CompoundSet(params['ligand_list_ref'])
        cs.split_multiple_models()

        dprint(ChemKBaseObj.created_instances)

        ##
        ####
        ###### params
        ###### run
        ####
        ##

        ##

        params_static = {'cpu': 4}

        params_default = {
            'num_modes': 1000,
            'energy_range': 10,
            'exhaustiveness': 20,
        }

        ##

        key_search_space_l = key_center_l + key_size_l
        key_misc_l = ['num_modes', 'energy_range', 'seed', 'exhaustiveness']

        ##

        out_pdbqt_filename_l = []
        log_filename_l = []

        ##
        ## for each ligand

        for ligand_name, ligand_pdbqt_filepath in zip(cs.pdbqt_compound_l,
                                                      cs.pdbqt_filepath_l):

            if params.get('skip_vina'):
                break

            run_name = ligand_name + '_vs_' + ps.name

            out_pdbqt_filename_l.append(run_name + '.pdbqt')
            log_filename_l.append(run_name + '.log')

            ##
            ## set up default params

            params_vina = {
                'receptor': ps.pdbqt_filepath,
                'ligand': ligand_pdbqt_filepath,
                'log': run_name + '.log',
                'out': run_name + '.pdbqt',
                **params_static,
                **params_default
            }

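            # Seed the docking box with the receptor's computed center and size;
            # user-supplied search_space values override these further below.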
            for space_coords_name in ['center', 'size']:
                space_coords = getattr(ps, space_coords_name)
                for k, v in zip(list('xyz'), space_coords):
                    params_vina[space_coords_name + '_' + k] = v

            ##
            ## check for search_space and misc params

            for key in key_misc_l:
                if params.get(key):
                    params_vina[key] = params[key]

            for key in key_search_space_l:
                if params_search_space.get(key):
                    params_vina[key] = params_search_space[key]

            ##
            ##

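            # Assemble the AutoDock Vina command line: every entry in
            # params_vina becomes a '--<param> <value>' flag.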
            cmd = 'vina'

            for param, arg in params_vina.items():
                cmd += ' --' + param + ' ' + str(arg)
            """
            _cmd = ( f"vina --receptor {ps.pdbqt_filepath} --ligand {ligand_pdbqt_filepath} "
                     f"--cpu 4 --log {run_name + '.log'} "
                     f"--center_x {ps.center[0]} --center_y {ps.center[1]} --center_z {ps.center[2]} "
                     f"--size_x {ps.size[0]} --size_y {ps.size[1]} --size_z {ps.size[2]} "
                     f"--out {run_name + '.pdbqt'}" )
            """

            retcode, stdout, stderr = dprint(
                cmd,
                run='cli',
                subproc_run_kwargs={'cwd': VarStash.shared_folder})
            if retcode != 0:
                raise RuntimeError(
                    f"AutoDock terminated abnormally with error message: "
                    f"[{stderr}] "
                    "You can check the logs (click the 'Job Status' tab in the upper right of the cell) for more information."
                )

            if params.get('skip_most_vina'):
                break

        ##
        ####
        ###### html
        ####
        ##

        hb = HTMLBuilder(ps, cs)

        ##
        ####
        ###### return directories
        ####
        ##

        def dir_to_shock(dir_path, name, description):
            '''
            For regular directories or html directories
            
            name - for regular directories: the name of the flat file returned to ui
                   for html directories: the name of the html file
            '''
            dfu_fileToShock_ret = VarStash.dfu.file_to_shock({
                'file_path': dir_path,
                'make_handle': 0,
                'pack': 'zip',
            })

            dir_shockInfo = {
                'shock_id': dfu_fileToShock_ret['shock_id'],
                'name': name,
                'description': description
            }

            return dir_shockInfo

        # return files

        dir_retFiles_path = os.path.join(self.shared_folder, 'pdbqt_log_dir')
        os.mkdir(dir_retFiles_path)

        for filename in out_pdbqt_filename_l + log_filename_l:
            shutil.copyfile(os.path.join(self.shared_folder, filename),
                            os.path.join(dir_retFiles_path, filename))

        # so DataFileUtil doesn't crash over zipping an empty folder
        if len(os.listdir(dir_retFiles_path)) == 0:
            dprint(
                rf"echo 'Sorry, no files were generated' > {os.path.join(dir_retFiles_path, 'README')}",
                run='cli')

        dir_retFiles_shockInfo = dir_to_shock(
            dir_retFiles_path, 'pdbqt_log.zip',
            'Generated .pdbqt and log files')

        # html

        html_shockInfo = dir_to_shock(hb.html_dir, 'index.html',
                                      'HTML report for AutoDock Vina')

        ##
        ####
        ###### report
        ####
        ##

        report_params = {
            'message': 'this is the report_params `message`',
            'warnings': ['this is the', 'report_params `warnings`'],
            'direct_html_link_index': 0,  #?0
            'html_links': [html_shockInfo],
            'file_links': [dir_retFiles_shockInfo],
            'report_object_name': 'autodock_vina' + self.suffix,
            'workspace_name': params['workspace_name'],
        }

        kbr = KBaseReport(self.callback_url)
        report_output = kbr.create_extended_report(report_params)

        output = {
            'report_name': report_output['name'],
            'report_ref': report_output['ref'],
        }

        #END ad_vina

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method ad_vina return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #16
    def find_motifs(self, ctx, params):
        """
        :param params: instance of type "find_motifs_params" -> structure:
           parameter "workspace_name" of String, parameter "fastapath" of
           String, parameter "prb" of Double, parameter "motif_length" of
           Long, parameter "obj_name" of String, parameter "mask_repeats" of
           Long
        :returns: instance of type "extract_output_params" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN find_motifs
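        # Illustrative params dict (hypothetical values; keys follow the
        # find_motifs_params docstring plus 'SS_ref', which the body below checks):
        #   {
        #       'workspace_name': 'my_workspace',
        #       'fastapath': '/kb/module/work/tmp/promoters.fasta',
        #       'prb': 0.05,
        #       'motif_length': 20,
        #       'obj_name': 'mfmd_motif_set',
        #       'mask_repeats': 1,
        #       'SS_ref': '1234/5/6'           # optional SequenceSet ref
        #   }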
        '''if 'motif_min_length' not in params:
            params['motif_min_length'] = 8
        if 'motif_max_length' not in params:
            params['motif_max_length'] = 16
        motMin = params['motif_min_length']
        motMax = params['motif_max_length']'''

        if 'motif_length' not in params:
            params['motif_length'] = 20
        if 'prb' not in params:
            params['prb'] = 0.05
        motlen = params['motif_length']
        prb = params['prb']

        FastaFilePath = params['fastapath']
        mfu = mfmdUtil()

        mfmdMotifCommand = mfu.build_mfmd_command(FastaFilePath, motlen, prb,
                                                  self.config)
        mfu.run_mfmd_command(mfmdMotifCommand)
        mfmd_out_path = '/kb/module/work/tmp/mfmd_out'
        mfmd_params = {
            'ws_name': params['workspace_name'],
            'path': mfmd_out_path,
            'location_path': mfmd_out_path + '/mfmd_out.txt',
            'obj_name': params['obj_name']
        }
        MOU = MotifUtils(self.callback_url)
        dfu = DataFileUtil(self.callback_url)
        locDict = {}
        if 'SS_ref' in params:
            get_ss_params = {'object_refs': [params['SS_ref']]}
            SS = dfu.get_objects(get_ss_params)['data'][0]['data']
            for s in SS['sequences']:
                if s['source'] is not None:
                    # Key by the sequence's own id (assumes each SequenceSet
                    # entry has a 'sequence_id' field); the original used the
                    # literal string 'sequence_id', overwriting one entry.
                    locDict[s['sequence_id']] = {
                        'contig': s['source']['location'][0][0],
                        'start': str(s['source']['location'][0][1])
                    }
        if len(locDict.keys()) > 0:
            mfmd_params['absolute_locations'] = locDict
        mfmd_params['motlen'] = motlen
        mfmd_params['prb'] = prb

        obj_ref = mfu.UploadFrommfmd(self.callback_url,
                                     mfmd_params)[0]['obj_ref']
        mfu.write_obj_ref(mfmd_out_path, obj_ref)

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        timestamp = str(timestamp)
        htmlDir = self.shared_folder + '/html' + timestamp
        os.mkdir(htmlDir)
        lineCount = 0

        dfu = DataFileUtil(self.callback_url)
        get_obj_params = {'object_refs': [obj_ref]}
        mfmdMotifSet = dfu.get_objects(get_obj_params)['data'][0]['data']
        mr = MakeNewReport()
        mr.MakeReport(htmlDir, mfmdMotifSet)

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': htmlDir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception as err:
            raise ValueError('error uploading HTML file to shock: ' + str(err))

        reportName = 'mfmdMotifFinder_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Motif Set generated by mfmd'
            }],
            'message': '',
            'direct_html': None,
            'direct_html_link_index': 0,
            'file_links': [],
            'html_links': [],
            'html_window_height': 220,
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }

        # attach to report obj
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            'name': 'index.html',
            'label': 'Save promoter_download.zip'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        report_info = report.create_extended_report(reportObj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END find_motifs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method find_motifs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #17
    def run_yangdar1en_ContigFilterDemo_max(self, ctx, params):
        """
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_yangdar1en_ContigFilterDemo_max
        for name in ['min_length', 'max_length', 'assembly_ref', 'workspace_name']:
            if name not in params:
                raise ValueError('Parameter "' + name + '" is required but missing')
        if not isinstance(params['min_length'], int) or (params['min_length'] < 0):
            raise ValueError('Min length must be a non-negative integer')
        if not isinstance(params['max_length'], int) or (params['max_length'] < 0):
            raise ValueError('Max length must be a non-negative integer')
        if not isinstance(params['assembly_ref'], str) or not len(params['assembly_ref']):
            raise ValueError('Pass in a valid assembly reference string')
        print(params['min_length'], params['max_length'], params['assembly_ref'])
        if params['max_length'] > 999999999:
            raise ValueError('Max length must be smaller than 999999999')

        assembly_util = AssemblyUtil(self.callback_url)
        fasta_file = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})

        # Parse the downloaded file in FASTA format
        parsed_assembly = SeqIO.parse(fasta_file['path'], 'fasta')
        min_length = params['min_length']
        max_length = params['max_length']

        # Keep a list of contigs greater than min_length
        good_contigs = []
        # total contigs regardless of length
        n_total = 0
        # total contigs over the min_length
        n_remaining = 0
        for record in parsed_assembly:
            n_total += 1
            if len(record.seq) >= min_length and len(record.seq) <= max_length:
                good_contigs.append(record)
                n_remaining += 1
        # output = {
        #     'n_total': n_total,
        #     'n_remaining': n_remaining
        # }
        # Create a file to hold the filtered data
        workspace_name = params['workspace_name']
        filtered_path = os.path.join(self.shared_folder, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_path, 'fasta')
        # Upload the filtered data to the workspace
        new_ref = assembly_util.save_assembly_from_fasta({
            'file': {'path': filtered_path},
            'workspace_name': workspace_name,
            'assembly_name': fasta_file['assembly_name']
        })
        ########################################################
        # Create an output summary message for the report
        text_message = "".join([
            'Filtered assembly to ',
            str(n_remaining),
            ' contigs out of ',
            str(n_total)
        ])
        # Data for creating the report, referencing the assembly we uploaded
        report_data = {
            'objects_created': [
                {'ref': new_ref, 'description': 'Filtered contigs'}
            ],
            'text_message': text_message
        }
        # Initialize the report
        kbase_report = KBaseReport(self.callback_url)
        report = kbase_report.create({
            'report': report_data,
            'workspace_name': workspace_name
        })

        # Return the report reference and name in our results
        output = {
            'report_ref': report['ref'],
            'report_name': report['name'],
            'n_total': n_total,
            'n_remaining': n_remaining,
            'filtered_assembly_ref': new_ref
        }

        ########################################################
        # output = {
        #     'n_total': n_total,
        #     'n_remaining': n_remaining,
        #     'filtered_assembly_ref': new_ref
        # }
        #END run_yangdar1en_ContigFilterDemo_max

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_yangdar1en_ContigFilterDemo_max return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #18
    def run_kb_ReadSim(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "Inparams" -> structure: parameter
           "workspace_name" of String, parameter "input_sample_set" of
           String, parameter "strain_info" of String, parameter
           "assembly_or_genome_ref" of String, parameter "base_error_rate" of
           String, parameter "outer_distance" of String, parameter
           "standard_deviation" of String, parameter "num_read_pairs" of
           String, parameter "len_first_read" of String, parameter
           "len_second_read" of String, parameter "mutation_rate" of String,
           parameter "frac_indels" of String, parameter
           "variation_object_name" of String, parameter "output_read_object"
           of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_ReadSim
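        # Illustrative params dict (hypothetical values; keys follow the
        # Inparams docstring, where every field is passed as a String):
        #   {
        #       'workspace_name': 'my_workspace',
        #       'input_sample_set': 'my_sample_set',
        #       'strain_info': 'strain_1',
        #       'assembly_or_genome_ref': '1234/5/6',
        #       'base_error_rate': '0.02',
        #       'outer_distance': '500',
        #       'standard_deviation': '50',
        #       'num_read_pairs': '1000000',
        #       'len_first_read': '100',
        #       'len_second_read': '100',
        #       'mutation_rate': '0.001',
        #       'frac_indels': '0.15',
        #       'variation_object_name': 'sim_variation',
        #       'output_read_object': 'sim_reads'
        #   }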
        output_dir = self.shared_folder
        print(params)
        self.su.validate_simreads_params(params)

        genome_or_assembly_ref = params['assembly_or_genome_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
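        # Resolve a Genome input to its underlying Assembly ref; an Assembly
        # input is used directly.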
        if ('KBaseGenomes.Genome' in obj_type):
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type +
                             ' is not the right input for this method. ' +
                             'Valid input include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly ')

        self.du.download_genome(assembly_ref, output_dir)

        ref_genome = os.path.join(self.shared_folder, "ref_genome.fa")
        output_fwd_paired_file_path = os.path.join(self.shared_folder,
                                                   "read1.fq")
        output_rev_paired_file_path = os.path.join(self.shared_folder,
                                                   "read2.fq")

        self.eu.check_path_exists(ref_genome)

        self.su.simreads(ref_genome, output_fwd_paired_file_path,
                         output_rev_paired_file_path, params)
        self.eu.check_path_exists(output_fwd_paired_file_path)
        self.eu.check_path_exists(output_rev_paired_file_path)

        retVal = self.ru.upload_reads({
            'wsname': params['workspace_name'],
            'name': params['output_read_object'],
            'sequencing_tech': 'illumina',
            'fwd_file': output_fwd_paired_file_path,
            'rev_file': output_rev_paired_file_path
        })

        logfile = os.path.join(self.shared_folder, "variant.txt")
        self.eu.check_path_exists(logfile)

        vcf_file = self.su.format_vcf(logfile)
        self.eu.check_path_exists(vcf_file)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['assembly_or_genome_ref'],
            'sample_set_ref': params['input_sample_set'],
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': vcf_file,
            'variation_object_name': params['variation_object_name']
        }
        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_ReadSim

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_ReadSim return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
    def import_samples(self, ctx, params):
        """
        :param params: instance of type "ImportSampleInputs" -> structure:
           parameter "sample_set_ref" of String, parameter "sample_file" of
           String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "file_format" of String,
           parameter "description" of String, parameter "set_name" of String,
           parameter "header_row_index" of Long, parameter "name_field" of
           String, parameter "output_format" of String, parameter
           "taxonomy_source" of String, parameter "num_otus" of Long,
           parameter "incl_seq" of Long, parameter "otu_prefix" of String,
           parameter "share_within_workspace" of Long, parameter
           "prevalidate" of Long, parameter "incl_input_in_output" of Long,
           parameter "ignore_warnings" of Long, parameter
           "keep_existing_samples" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples
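        # Illustrative params dict (hypothetical values; a subset of the
        # ImportSampleInputs fields documented above):
        #   {
        #       'sample_file': 'samples.xlsx',
        #       'workspace_name': 'my_workspace',
        #       'workspace_id': 12345,
        #       'file_format': 'sesar',
        #       'set_name': 'my_sample_set',
        #       'header_row_index': 2,
        #       'incl_input_in_output': 1
        #   }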
        print(f"Beginning sample import with following parameters:")
        print(f"params -- {params}")
        sample_set = {"samples": []}
        # Check if we have an existing Sample Set as input
        # if so, download
        if params.get('sample_set_ref'):
            ret = self.dfu.get_objects(
                {'object_refs': [params['sample_set_ref']]})['data'][0]
            sample_set = ret['data']
            if params.get('set_name'):
                set_name = params.get('set_name')
            else:
                set_name = ret['info'][1]
            save_ws_id = params['sample_set_ref'].split('/')[0]
        else:
            if not params.get('set_name'):
                raise ValueError(
                    "A sample set name is required when creating a new SampleSet object."
                )
            set_name = params['set_name']
            save_ws_id = params.get('workspace_id')
        if params.get('header_row_index'):
            header_row_index = int(params["header_row_index"]) - 1
        else:
            header_row_index = find_header_row(params.get('sample_file'),
                                               params.get('file_format'))

        username = ctx['user_id']

        if str(params.get('file_format')).lower() not in [
                'enigma', 'sesar', 'kbase'
        ]:
            raise ValueError(
                f"Only SESAR, ENIGMA, and KBase formats are currently supported for importing samples. "
                f"File of format {params.get('file_format')} not supported.")
        mappings = {
            'enigma': ENIGMA_mappings,
            'sesar': SESAR_mappings,
            'kbase': {}
        }

        file_format = str(params.get('file_format')).lower()
        format_mappings = mappings[file_format]

        sample_set, has_unignored_errors, errors, sample_data_json = import_samples_from_file(
            params, self.sample_url, self.workspace_url, self.callback_url,
            username, ctx['token'],
            format_mappings.get('groups', []),
            format_mappings.get('date_columns', []),
            format_mappings.get('column_unit_regex', []),
            sample_set, header_row_index,
            aliases.get(file_format, {}))

        file_links = []
        new_data_links = []
        sample_set_ref = None

        # create UI to display the errors clearly
        html_link = _error_ui(errors, sample_data_json, has_unignored_errors,
                              self.scratch)

        if not has_unignored_errors:
            # only save object if there are no errors
            obj_info = self.dfu.save_objects({
                'id': save_ws_id,
                'objects': [{
                    "name": set_name,
                    "type": "KBaseSets.SampleSet",
                    "data": sample_set
                }]
            })[0]

            sample_set_ref = '/'.join(
                [str(obj_info[6]), str(obj_info[0]), str(obj_info[4])])
            sample_file_name = os.path.basename(
                params['sample_file']).split('.')[0] + '_OTU'

            # create a data link between each sample and the sampleset
            ss = SampleService(self.sample_url)
            for idx, sample_info in enumerate(sample_set['samples']):
                sample_id = sample_info['id']
                version = sample_info['version']
                sample = ss.get_sample({
                    'id': sample_id,
                    'version': version,
                })
                ret = ss.create_data_link(
                    dict(
                        upa=sample_set_ref,
                        id=sample_id,
                        dataid='samples/{}'.format(idx),
                        version=version,
                        node=sample['node_tree'][0]['id'],
                        update=1,
                    ))
                new_data_links.append(ret)

            # -- Format outputs below --
            # if output file format specified, add one to output
            if params.get('output_format') in ['csv', 'xls']:
                otu_path = sample_set_to_OTU_sheet(sample_set,
                                                   sample_file_name,
                                                   self.scratch, params)
                file_links.append({
                    'path': otu_path,
                    'name': os.path.basename(otu_path),
                    'label': "OTU template file",
                    'description': "file with each column containing the assigned sample_id and sample "
                                   "name of each saved sample. Intended for uploading OTU data."
                })

        if params.get('incl_input_in_output'):
            sample_file = params.get('sample_file')
            if not os.path.isfile(sample_file):
                # try prepending '/staging/' to file and check then
                if os.path.isfile(os.path.join('/staging', sample_file)):
                    sample_file = os.path.join('/staging', sample_file)
                else:
                    raise ValueError(
                        f"Input file {sample_file} does not exist.")
            sample_file_copy = os.path.join(self.scratch,
                                            os.path.basename(sample_file))
            shutil.copy(sample_file, sample_file_copy)
            file_links.append({
                "path": sample_file_copy,
                "name": os.path.basename(sample_file_copy),
                "label": "Input Sample file",
                "description": "Input file provided to create the sample set."
            })

        # create report
        report_client = KBaseReport(self.callback_url)
        report_data = {
            'report_object_name': "SampleSet_import_report_" + str(uuid.uuid4()),
            'workspace_name': params['workspace_name']
        }
        if file_links:
            report_data['file_links'] = file_links
        if sample_set_ref:
            report_data['message'] = f"SampleSet object named \"{set_name}\" imported."
            report_data['objects_created'] = [{'ref': sample_set_ref}]

        if html_link:
            report_data['html_links'] = [{
                'path': html_link,
                'name': 'index.html',
                'description': 'HTML Report for Sample Uploader'
            }]
            report_data['direct_html_link_index'] = 0
        report_info = report_client.create_extended_report(report_data)
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
            'sample_set': sample_set,
            'sample_set_ref': sample_set_ref,
            'errors': errors,
            'links': new_data_links
        }
        #END import_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #20
    def run_omreegalozpathway_completeness(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_omreegalozpathway_completeness

        #Preparing report client
        report_client = KBaseReport(self.callback_url)

        #Original report info
        report_info = report_client.create({
            'report': {
                'objects_created': [],
                'text_message': params['main_input_ref']
            },
            'workspace_name': params['workspace_name']
        })

        token = os.environ.get('KB_AUTH_TOKEN', None)

        #Checking the input params
        if "main_input_ref" in params:
            main_input_ref = params['main_input_ref']
        else:
            logging.info(
                'the reference number is not in the params, program must end.')
            raise Exception("main_input_ref not in params")

        #Creating the workspace client object
        ws = Workspace(self.ws_url, token=token)

        #Getting information about the main input ref
        obj_info = ws.get_object_info3({'objects': [{'ref': main_input_ref}]})

        #Catching errors:
        if "infos" in obj_info:
            #Getting information from object reference number
            object_name = obj_info["infos"][0][1]
            object_type = obj_info["infos"][0][2]
            ws_name = obj_info["infos"][0][7]

            #Logging:
            logging.debug("Object Type: " + object_type)
            logging.debug("Object Name: " + object_name)
            logging.debug("Workspace Name: " + ws_name)
        else:
            logging.info(
                "The function ws.get_object_info3 failed to download the right information. The program must abort."
            )
            raise Exception("Could not find infos in obj_info")

        #We create the output file name and add information to it later.
        output_file_name = 'pathways_measurements'

        #This part is a hack, need to check type of data more accurately.
        if object_type[:17] == 'KBaseFBA.FBAModel':
            logging.info("Succesfully recognized type as FBA Model")

            #Preparing the output file name which we return to the user
            output_file_name += '_fba_model'

            #Creating an fba tools object
            fba_t = fba_tools(self.callback_url)

            # Getting the TSV file from the object
            X = fba_t.export_model_as_tsv_file({"input_ref": main_input_ref})

            # Logging
            logging.info(
                "the object output from fba tools export model as tsv file:")
            logging.info(X)

            #Locating where the reactions tsv was placed (Not well done- replace this with a robust form)
            reactions_file_path = os.path.join(
                self.shared_folder,
                object_name + '/' + object_name + '-reactions.tsv')

            #Preparing an output path for a future function
            output_path = os.path.join(self.shared_folder,
                                       output_file_name + '.tsv')

            #This function performs the percentage calculation work for FBAModel Object Types.
            html_path = reactions_file_to_pathway_reactions_and_percentages(
                reactions_file_path, output_path, object_name)

        # Using KBase Gene Families- Domain Annotation
        elif object_type[:34] == "KBaseGeneFamilies.DomainAnnotation":
            logging.info("Succesfully recognized type as Domain Annotation")
            output_file_name += '_domain_annotation'

            #We get the object using workspace's get_objects2 function
            obj = ws.get_objects2({'objects': [{'ref': main_input_ref}]})

            #Within the way the object dictionary is given, what we are looking for is in the location as follows:
            Y = obj['data'][0]['data']['data']

            #Preparing our own output_file_path with Domain Annotation instead of FBAModel (why?)
            output_file_path = os.path.join(self.shared_folder,
                                            output_file_name + '.tsv')

            #This function (written for the module) finds percentages of pathway completeness.
            html_path = TIGRFAM_file_to_pathway_reactions_and_percentages(
                Y, output_file_path, object_name)

        else:
            logging.info("Object type unknown")
            raise Exception(
                "Could not recognize the referenced object. Check whether the object is an FBA Model or a Domain Annotation type. If so, the error is in the program, not the input - contact [email protected]."
            )

        html_dict = [{"path": html_path, "name": 'Completeness_Table'}]

        #Preparing final report:
        report = report_client.create_extended_report({
            'direct_html_link_index': 0,
            'message': 'Here are the pathway completeness results',
            'workspace_name': ws_name,
            'html_links': html_dict
        })

        output = {
            'report_name': report['name'],
            'report_ref': report['ref'],
        }
        #END run_omreegalozpathway_completeness

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method run_omreegalozpathway_completeness return value ' +
                'output is not type dict as required.')
        # return the results
        return [output]
Example #21
    def run_CompMolNWChem(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_CompMolNWChem

        # Initial Tests to Check for Proper Inputs

        for name in ['Input_File', 'calculation_type', 'workspace_name']:
            if name not in params:
                raise ValueError('Parameter "' + name + '" is required but missing')
        if not isinstance(params['Input_File'], str):
            raise ValueError('Input_File must be a string')

        
        # Load the tsv file into a compound set using DataFileUtil methods
        
        scratch_file_path = self.dfu.download_staging_file(
            {'staging_file_subdir_path': params['Input_File']}).get('copy_file_path')

        #print('Scratch File Path: ',scratch_file_path)

        mol2_file_dir = None        
        ext = os.path.splitext(scratch_file_path)[1]
        file_name = os.path.basename(scratch_file_path)
        if ext == '.sdf':
            compounds = parse.read_sdf(scratch_file_path,
                                       mol2_file_dir=mol2_file_dir,
                                       callback_url=self.callback_url)
        elif ext == '.tsv':
            compounds = parse.read_tsv(scratch_file_path,
                                       mol2_file_dir=mol2_file_dir,
                                       callback_url=self.callback_url)
        #elif ext == '.csv':
        #    compounds = parse.read_csv(scratch_file_path,
        #                               mol2_file_dir=mol2_file_dir,
        #                               callback_url=self.callback_url)
        #else:
        #    raise ValueError('Invalid input file type. Expects .tsv or .sdf')

        #DEBUG::
        #print('Compounds:',compounds)

#        compoundset = {
#            'id': params['Input_File'],
#            'name': params['Input_File'],
#            'description': 'Compound Set produced from %s' % file_name,
#            'compounds': compounds,
#        }

        # Finish Reading in Compound Set

        # Read ids and smiles from compound set for nwchem input
        
#        ids = []
#        smiles = []

#        for d in compounds:
#           ids.append(d['id'])
#           smiles.append(d['smiles'])
        #print(ids)
        #print(smiles)
        

        
        # Read the ids and structures of the compounds
        
#        its.inchi_to_dft(ids,smiles)

        #DEBUG::
        #os.system('pwd')
        #os.system('ls')
        
#        length = len(ids)
#        for i in range(length):
#            os.chdir('./'+ids[i]+'/dft')
#            x = ids[i] + '_nwchem.out'
            #print('x:',x)
#            file1 = open(x, 'r')
#            nAtoms = mul.getNumberOfAtoms(file1)
#            energy = mul.getInternalEnergy0K(file1)
#            charge =mul.getMullikenCharge(file1,nAtoms)
#            file1.close()
           
#            mul.nAtoms = nAtoms
#            mul.E0K = energy

#            mul.calculate(ids[i])

       
        from snakemake import snakemake

        reactionlist = scratch_file_path

        id_to_smiles = {}
        with open('/kb/module/modelseed_test.csv', 'r') as data:
            for line in data.readlines():
                compound_id = line.split(',')[0]
                smiles = line.split(',')[1].rstrip()
                id_to_smiles[compound_id] = smiles

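        # The staged input appears to hold a single reaction line such as
        # 'cpd00001 + cpd00002 = cpd00003' (an assumption based on the parsing
        # below); each metabolite id then gets initial_structure/md/dft working
        # directories and a .smiles seed file looked up from modelseed_test.csv.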
        with open(reactionlist,'r') as f:
            reactions = f.readlines()[0].rstrip()
            reactant = reactions.split('=')[0].split('+')
            product = reactions.split('=')[1].split('+')
            metabolites = []
            for each in reactant:
                each = each.strip()
                metabolites.append(each)
            for each in product:
                each = each.strip()
                metabolites.append(each)

            for molecule in metabolites:
                
                moldir = molecule
                if not os.path.exists(moldir):
                    os.mkdir(moldir)
    
                initial_structure_dir = moldir + '/initial_structure'
                if not os.path.exists(initial_structure_dir):
                    os.mkdir(initial_structure_dir)

                md_structure_dir = moldir + '/md'
                if not os.path.exists(md_structure_dir):
                    os.mkdir(md_structure_dir)

                dft_structure_dir = moldir + '/dft'
                if not os.path.exists(dft_structure_dir):
                    os.mkdir(dft_structure_dir)

                inchifile_str = initial_structure_dir + '/' + moldir + '.smiles'
                with open(inchifile_str,'w+') as f:
                    f.write(id_to_smiles[moldir])
        
        os.system('snakemake -p --cores 3 --snakefile snakemake-scripts/final_pipeline.snakemake -w 12000')

        # Build KBase Output. Should output entire /simulation directory and build a CompoundSet with Mol2 Files

        #result_directory = '/kb/module/snakemake-scripts'
        result_directory = '/kb/module/'

        ## Build CompoundSet with Mol2 Files... similarly to fetch_mol2_files_from_zinc (CompoundSetUtils)....

#        compoundset_copy = copy.deepcopy(compoundset)

#        count = 0

#        for compound in compoundset_copy.get('compounds'):
#            if not compound.get('mol2_handle_ref'):
#                mol2_file_path = result_directory+compound.get('id')
#                SMILES = compound.get('smiles')

#                shutil.move(mol2_file_path,self.scratch)

#                os.chdir(self.scratch)
               
#                mol2_file_path = self.scratch + '/'+ compound.get('id')+'/dft/' + compound.get('id')+'_Mulliken.mol2'              
#                handle_id = self.dfu.file_to_shock({'file_path': mol2_file_path,
#                                                    'make_handle': True})['handle']['hid']
#                print('Handle ID:',handle_id)
#                compound['mol2_handle_ref'] = handle_id
#                count += 1

               
               
#        if count:
#            message = 'Successfully fetched {} Mol2 files from Staging Path'.format(count)


        ## Create Extended Report
        
        output_files = self._generate_output_file_list(result_directory)
        #output_files = 
        
        report_params = {
            'message':'',
            'workspace_id': params['workspace_id'],
            'objects_created': [],
            'file_links':output_files,
            'report_object_name': 'kb_deseq2_report_' + str(uuid.uuid4())}

        report = KBaseReport(self.callback_url)
        
        report_info = report.create_extended_report(report_params)

        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }

        return [output]
Example #22
    def _save_output_to_kbase(self, io_params, app_params, output_dir, run_log, run_command):
        # TODO: insert the run_command into the output log
        #
        # read the output file list
        file_lookup = self._read_outputfile(os.path.join(output_dir, 'file-list.txt'))

        # save the new reads
        mapped_reads_ref = None
        unmapped_reads_ref = None
        objects_created = []
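        # Each comma-separated FASTQ listed under 'mapped_reads_files' /
        # 'unmapped_reads_files' in file-list.txt is uploaded as an interleaved
        # reads object named '<file_name>.reads'.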
        if 'mapped_reads_files' not in file_lookup:
            print('No mapped reads fastq file found in output.  Not creating any mapped reads objects.')
        else:
            for file_name in file_lookup['mapped_reads_files'].split(','):
                mapped_reads_path = os.path.join(output_dir, file_name)
                mapped_reads_ref = upload_interleaved_reads(
                    self.callback_url,
                    mapped_reads_path,
                    io_params['workspace_name'],
                    file_name+'.reads',
                    io_params.get('in_readslib_ref'))
                objects_created.append({
                    'ref': mapped_reads_ref,
                    'description': 'Mapped reads library'
                })
        if 'unmapped_reads_files' not in file_lookup:
            print('No unmapped reads fastq file found in output.  Not creating any unmapped reads objects.')
        else:
            for file_name in file_lookup['unmapped_reads_files'].split(','):
                unmapped_reads_path = os.path.join(output_dir, file_name)
                unmapped_reads_ref = upload_interleaved_reads(
                    self.callback_url,
                    unmapped_reads_path,
                    io_params['workspace_name'],
                    file_name+'.reads',
                    io_params.get('in_readslib_ref'))
                objects_created.append({
                    'ref': unmapped_reads_ref,
                    'description': 'Unmapped reads library'
                })

        # build the HTML report
        html_zipped = self._build_html_report(io_params.get('in_readslib_ref'), output_dir, file_lookup)
        file_links = self._build_file_report(output_dir, run_log)
        # save the report
        report_params = {
            'message': '',
            'objects_created': objects_created,
            'direct_html_link_index': 0,
            'html_links': [html_zipped],
            'file_links': file_links,
            'report_object_name': 'bbtools_bbmap_report_' + str(uuid.uuid4()),
            'workspace_name': io_params['workspace_name']
        }

        kr = KBaseReport(self.callback_url)
        report_output = kr.create_extended_report(report_params)

        return {'report_name': report_output['name'],
                'report_ref': report_output['ref'],
                'run_command': run_command}
Example #23
def generate_report(callback_url, token, workspace_name, shared_folder: Path,
                    virmatcher_output: Path):
    """
    :param callback_url:
    :param token: Job token
    :param workspace_name: Workspace name
    :param shared_folder: KBase working directory on the node, used to save the HTML file
    :param virmatcher_output: VirMatcher proper final results directory, should have the summary file
    :return:
    """
    html_template = Template("""<!DOCTYPE html>
    <html lang="en">
      <head>

        <link href="https://netdna.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" rel="stylesheet">
        <link href="https://cdn.datatables.net/1.10.22/css/jquery.dataTables.min.css" rel="stylesheet">
        <link href="https://cdn.datatables.net/buttons/1.5.2/css/buttons.dataTables.min.css" rel="stylesheet">

        <link href="https://cdn.datatables.net/searchpanes/1.2.0/css/searchPanes.dataTables.min.css" rel="stylesheet">
        <link href="https://cdn.datatables.net/select/1.3.1/css/select.dataTables.min.css" rel="stylesheet">

        <script src="https://code.jquery.com/jquery-3.5.1.js" type="text/javascript"></script>
        <script src="https://cdn.datatables.net/1.10.22/js/jquery.dataTables.min.js" type="text/javascript"></script>
        <script src="https://cdn.datatables.net/buttons/1.6.4/js/dataTables.buttons.min.js" type="text/javascript"></script>
        <script src="https://cdn.datatables.net/buttons/1.6.4/js/buttons.flash.min.js" type="text/javascript"></script>
        <script src="https://cdnjs.cloudflare.com/ajax/libs/jszip/3.1.3/jszip.min.js" type="text/javascript"></script>
        <script src="https://cdnjs.cloudflare.com/ajax/libs/pdfmake/0.1.53/pdfmake.min.js" type="text/javascript"></script>
        <script src="https://cdnjs.cloudflare.com/ajax/libs/pdfmake/0.1.53/vfs_fonts.js" type="text/javascript"></script>
        <script src="https://cdn.datatables.net/buttons/1.6.4/js/buttons.html5.min.js" type="text/javascript"></script>
        <script src="https://cdn.datatables.net/buttons/1.6.4/js/buttons.print.min.js" type="text/javascript"></script>

        <script src="https://cdn.datatables.net/searchpanes/1.2.0/js/dataTables.searchPanes.min.js" type="text/javascript"></script>
        <script src="https://cdn.datatables.net/select/1.3.1/js/dataTables.select.min.js" type="text/javascript"></script>

        <style>
        tfoot input {
            width: 100%;
            padding: 3px;
            box-sizing: border-box;
        }
        </style>

      </head>

      <body>

        <div class="container">
          <div>
            ${html_table}
          </div>
        </div>

        <script type="text/javascript">
          $$(document).ready(function() {
            $$('#my_id tfoot th').each( function () {
              var title = $$(this).text();
              $$(this).html( '<input type="text" placeholder="Search '+title+'" />' );
            });

            var table = $$('#my_id').DataTable({
              buttons: [
                'copy', 'csv', 'excel', 'pdf', 'print'],
              scrollX: true,
              dom: 'lBfrtip'  //P to B disables panes
            });

            table.columns().every( function () {
              var that = this;

              $$( 'input', this.footer() ).on( 'keyup change', function () {
                if ( that.search() !== this.value ) {
                  that
                  .search( this.value )
                  .draw();
                }
              });
            } );
          } );
        </script>

      </body>
    </html>""")

    report = KBaseReport(callback_url, token=token)
    dfu = DataFileUtil(callback_url, token=token)

    virmatcher_fp = virmatcher_output / 'VirMatcher_Summary_Predictions.tsv'

    virmatcher_df = pd.read_csv(virmatcher_fp,
                                header=0,
                                index_col=None,
                                delimiter='\t')

    # Set column ordering
    order = [
        'Original Viral population',
        'Original Host',
        'Final_score',
        'CRISPR score',
        'Prophage blast score',
        'WIsH score',
        'Number of CRISPR matches',
        'Max number of end mismatches detected in any CRISPR spacer',
        'Prophage blast percent identity',
        'Prophage blast viral contig coverage',
        'tRNA match',
        'Max number of end mismatches detected in host tRNA',
        'Non-promiscuous tRNA match score',
        'WIsH p-value',
        # 'LogLikelihood',
        'Viral population',
        'Predicted host',
    ]
    order = [ele for ele in order
             if ele in virmatcher_df.columns.tolist()]  # Some may not exist!
    virmatcher_df = virmatcher_df[order]

    html = virmatcher_df.to_html(index=False,
                                 classes='my_class table-striped" id = "my_id')
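    # The 'classes' string above deliberately closes the class attribute and injects
    # id="my_id" so the DataTables init can find the table; newer pandas versions
    # expose a dedicated table_id argument for the same purpose.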

    # Write the substituted HTML out to a file below
    direct_html = html_template.substitute(html_table=html)

    # Find the header so it can be copied into a footer, since DataFrame.to_html does not generate one
    start_header = Literal("<thead>")
    end_header = Literal("</thead>")

    text = start_header + SkipTo(end_header)

    new_text = ''
    for data, start_pos, end_pos in text.scanString(direct_html):
        new_text = ''.join(data).replace(
            ' style="text-align: right;"', '').replace(
                'thead>', 'tfoot>\n  ') + '\n</tfoot>'

    # Get start and end positions to insert new text
    end_tbody = Literal("</tbody>")
    end_table = Literal("</table>")

    insertion_pos = end_tbody + SkipTo(end_table)

    final_html = ''
    for data, start_pos, end_pos in insertion_pos.scanString(direct_html):
        # Insert the copied footer immediately after '</tbody>' (len('</tbody>') == 8)
        insert_at = start_pos + len('</tbody>')
        final_html = direct_html[:insert_at] + '\n' + new_text + direct_html[insert_at:]
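    # For reference, a minimal pyparsing-free sketch of the same footer injection
    # (assumes the <thead>/<tbody> markup emitted by DataFrame.to_html; kept here
    # only as an illustration):
    #
    #   thead_start = direct_html.index('<thead>')
    #   thead_end = direct_html.index('</thead>') + len('</thead>')
    #   tfoot = direct_html[thead_start:thead_end].replace('thead', 'tfoot')
    #   final_html = direct_html.replace('</tbody>', '</tbody>\n' + tfoot, 1)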

    output_dir = shared_folder / str(uuid.uuid4())

    os.mkdir(output_dir)

    html_fp = output_dir / 'index.html'

    with open(html_fp, 'w') as html_fh:
        html_fh.write(final_html)

    report_shock_id = dfu.file_to_shock({
        'file_path': str(output_dir),
        'pack': 'zip'
    })['shock_id']
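    # file_to_shock with pack='zip' archives the whole report directory; the
    # html_links entry below then points the report viewer at index.html inside
    # that archive.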

    html_report = [{
        'shock_id': report_shock_id,
        'name': 'index.html',
        'label': 'index.html',
        'description': 'Summary report for VirMatcher'
    }]

    report_params = {
        'message': 'VirMatcher summary report',
        'workspace_name': workspace_name,
        'html_links': html_report,
        'direct_html_link_index': 0,
        'report_object_name': f'VirMatcher_report_{str(uuid.uuid4())}',
    }

    report_output = report.create_extended_report(report_params)

    return report_output
Example #24
    def run_simplebatch(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "SimpleBatchParams" -> structure:
           parameter "batch_inputs" of type "batch_params" -> list of type
           "app_params" -> mapping from String to unspecified object,
           parameter "method_name" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_simplebatch
        report = KBaseReport(self.callback_url)

        #TODO Always request WSID?
        #"simpleapp.simple_add"
        method_name = "simpleapp.simple_add"  #params['method_name']
        wsid = "TODO"
        #TODO Get Service_Ver
        service_ver = "dev"
        batched_app_params = params['app_params']

        job_ids = []
        statuses = []

        for i, app_param in enumerate(batched_app_params):
            print(f"About to submit job with params {app_param}")
            rjp = {
                "method": method_name,
                "params": [app_param],
                "service_ver": service_ver,
                "wsid": wsid,
                "app_id": "RanWithBatch",
            }
            try:
                job_id = self.ee2.run_job(params=rjp)
                status = "queued"
            except Exception:
                job_id = "failed to submit"
                status = "failure"

            job_ids.append(job_id)
            statuses.append(status)

        #TODO Create table with refresh buttons or autorefresh, which uses cookie or environment
        # Send this as a report

        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': f"Submitted {len(job_ids)} job(s); statuses: {statuses}"
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_simplebatch

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_simplebatch return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #25
    def run_MotifSuite(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "motifsuite_seq_input" -> structure:
           parameter "workspace_name" of String, parameter "genome_ref" of
           String, parameter "SS_ref" of String, parameter "promoter_length"
           of Long, parameter "motif_min_length" of Long, parameter
           "motif_max_length" of Long, parameter "obj_name" of String,
           parameter "prb" of Double, parameter "motif_length" of Long,
           parameter "background" of Long, parameter "mask_repeats" of Long,
           parameter "background_group" of mapping from String to String,
           parameter "threshold" of Double, parameter "proportion" of Double
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_MotifSuite
        report = KBaseReport(self.callback_url)
        mfmd_obj = MotifFindermfmd(self.callback_url)
        homer_obj = MotifFinderHomer(self.callback_url)
        meme_obj =  MotifFinderMEME(self.callback_url)
        gibbs_obj = MotifFinderGibbs(self.callback_url)
        ensemble_obj = MotifEnsemble(self.callback_url)
        mdscan_obj = MotifFinderMdscan(self.callback_url)
        sampler_obj =  MotifFinderSampler(self.callback_url)
        
        # Run each motif finder in its own subprocess, one after another.
        finders = [homer_obj, mfmd_obj, meme_obj, gibbs_obj, mdscan_obj, sampler_obj]
        for finder in finders:
            p = Process(target=finder.DiscoverMotifsFromSequenceSet, args=(params,))
            p.start()
            p.join()

        MSU = MotifSuiteUtil()
        params['motifset_refs'] = MSU.get_obj_refs()
        #params['motifset_refs'] =['29716/72/131','29716/72/132','29716/72/133','29716/72/134','29716/72/135','29716/72/136']
        #params['motifset_refs'] =['29716/72/131','29716/72/132','29716/72/133']
        print(params['motifset_refs'])
        #result = ensemble_obj.MotifEnsemble(params)
        #print('Ensemble RESULT:')
        #print(result)


        dms = DownloadMotifSets()
        MotifSetDict = dms.DownloadMotifSet(params['motifset_refs'], self.callback_url)

        matchSets = []
        threshold = float(params['threshold'])
        fmu = FastaUtils()
        for i,MSR1 in enumerate(MotifSetDict.keys()):
            for j,motif1 in enumerate(MotifSetDict[MSR1]['Motifs']):
                for k,MSR2 in enumerate(MotifSetDict.keys()):
                    if k > i:
                        for l,motif2 in enumerate(MotifSetDict[MSR2]['Motifs']):
                            if fmu.CompareMotifsBP(motif1,motif2,threshold):
                                found1 = False
                                found2 = False
                                index1 = -1
                                index2 = -1
                                for m,mset in enumerate(matchSets):
                                    if (MSR1,j) in mset:
                                        found1 = True
                                        index1 = m
                                    if(MSR2,l) in mset:
                                        found2 = True
                                        index2 = m
                                if not found1 and found2:
                                    matchSets[index2].add((MSR1,j))
                                elif not found2 and found1:
                                    matchSets[index1].add((MSR2,l))
                                elif found1 and found2:
                                    if index1 != index2:
                                        matchSets[index1] |= matchSets[index2]
                                        matchSets.pop(index2)
                                else:
                                    matchSets.append(set([(MSR1,j),(MSR2,l)]))
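        # matchSets now groups (motifset_ref, motif_index) pairs that CompareMotifsBP
        # judged equivalent; below, only groups found in at least 'proportion' of the
        # input MotifSets are kept for the ensemble set.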
        numMotifSets = len(params['motifset_refs'])
        threshold = float(params['proportion'])
        KeepSets = []
        print('NUM MATCHSETS********')
        print(len(matchSets))
        for i,mset in enumerate(matchSets):
            uniqueRefs = {}
            for pair in mset:
                if pair[0] not in uniqueRefs:
                    uniqueRefs[pair[0]] = pair[0]
            if float(len(uniqueRefs.keys()))/numMotifSets >= threshold:
                KeepSets.append(i)
        print(len(KeepSets))

        ESO = {}
        for ref in MotifSetDict:
            ESO['Condition'] = MotifSetDict[ref]['Condition']
            ESO['SequenceSet_ref'] = MotifSetDict[ref]['SequenceSet_ref']
            ESO['Alphabet'] = deepcopy(MotifSetDict[ref]['Alphabet'])
            ESO['Background'] = deepcopy(MotifSetDict[ref]['Background'])
            break
        ESO['Motifs'] = []
        #Add motifs
        for keep in KeepSets:
            motif = fmu.merge(matchSets[keep],MotifSetDict)
            ESO['Motifs'].append(deepcopy(motif))


        #upload new MSO
        dfu = DataFileUtil(self.callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        save_objects_params['objects'] = [{'type': 'KBaseGeneRegulation.MotifSet' , 'data' : ESO , 'name' : 'EnsembleMotifSet'}]

        info = dfu.save_objects(save_objects_params)[0]
        obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        htmlDir = self.shared_folder + '/ensemble_html'
        os.mkdir(htmlDir)
        mr = MakeNewReport()
        mr.MakeReport(htmlDir, ESO)


        try:
            html_upload_ret = dfu.file_to_shock({'file_path': htmlDir, 'make_handle': 0, 'pack': 'zip'})
        except Exception as err:
            raise ValueError('Error uploading HTML file to shock') from err


        reportName = 'MotifSuite_report_' + str(uuid.uuid4())

        reportObj = {'objects_created': [{'ref': obj_ref, 'description': 'Ensemble Motif Set generated by MotifSuite'}],
                     'message': '',
                     'direct_html': None,
                     'direct_html_link_index': 0,
                     'file_links': [],
                     'html_links': [],
                     'html_window_height': 220,
                     'workspace_name': params['workspace_name'],
                     'report_object_name': reportName
                     }


        # attach to report obj
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                    'name': 'index.html',
                                    'label': 'Ensemble motif report'
                                    }]


        report = KBaseReport(self.callback_url, token=ctx['token'])
        report_info = report.create_extended_report(reportObj)
        output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] }

        
        #END run_MotifSuite

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_MotifSuite return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #26
    def import_samples(self, ctx, params):
        """
        :param params: instance of type "ImportSampleInputs" -> structure:
           parameter "sample_set_ref" of String, parameter "sample_file" of
           String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "file_format" of String,
           parameter "description" of String, parameter "set_name" of String,
           parameter "header_row_index" of Long, parameter "id_field" of
           String, parameter "output_format" of String, parameter
           "taxonomy_source" of String, parameter "num_otus" of Long,
           parameter "incl_seq" of Long, parameter "otu_prefix" of String,
           parameter "share_within_workspace" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples
        print(f"Beginning sample import with following parameters:")
        print(f"params -- {params}")
        sample_set = {"samples": []}
        # We subtract by 1 for zero indexing.
        if params.get('sample_set_ref'):
            ret = self.dfu.get_objects(
                {'object_refs': [params['sample_set_ref']]})['data'][0]
            sample_set = ret['data']
            set_name = ret['info'][1]
            save_ws_id = params['sample_set_ref'].split('/')[0]
        else:
            if not params.get('set_name'):
                raise ValueError(
                    "A sample set name is required when creating a new SampleSet object."
                )
            set_name = params['set_name']
            save_ws_id = params.get('workspace_id')
        if params.get('header_row_index'):
            header_row_index = int(params["header_row_index"]) - 1
        else:
            header_row_index = 0
            if params.get('file_format') == "SESAR":
                header_row_index = 1

        username = ctx['user_id']

        if params.get('file_format') == 'ENIGMA':
            # ENIGMA_mappings['verification_mapping'].update(
            #     {key: ("is_string", []) for key in ENIGMA_mappings['basic_columns']}
            # )
            sample_set = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], ENIGMA_mappings['column_mapping'],
                ENIGMA_mappings.get('groups',
                                    []), ENIGMA_mappings['date_columns'],
                ENIGMA_mappings.get('column_unit_regex',
                                    []), sample_set, header_row_index)
        elif params.get('file_format') == 'SESAR':
            # SESAR_mappings['verification_mapping'].update(
            #     {key: ("is_string", []) for key in SESAR_mappings['basic_columns']}
            # )
            sample_set = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], SESAR_mappings['column_mapping'],
                SESAR_mappings.get('groups',
                                   []), SESAR_mappings['date_columns'],
                SESAR_mappings.get('column_unit_regex',
                                   []), sample_set, header_row_index)
        elif params.get('file_format') == 'KBASE':
            sample_set = import_samples_from_file(params, self.sw_url,
                                                  self.workspace_url, username,
                                                  ctx['token'], {}, [], [], [],
                                                  sample_set, header_row_index)
        else:
            raise ValueError(
                "Only SESAR, ENIGMA, and KBASE formats are currently supported for importing samples. "
                f"File of format {params.get('file_format')} not supported.")

        obj_info = self.dfu.save_objects({
            'id': save_ws_id,
            'objects': [{
                "name": set_name,
                "type": "KBaseSets.SampleSet",
                "data": sample_set
            }]
        })[0]

        sample_set_ref = '/'.join(
            [str(obj_info[6]),
             str(obj_info[0]),
             str(obj_info[4])])
        sample_file_name = os.path.basename(
            params['sample_file']).split('.')[0] + '_OTU'

        # -- Format outputs below --
        # if output file format specified, add one to output
        if params.get('output_format') in ['csv', 'xls']:
            otu_path = sample_set_to_OTU_sheet(sample_set, sample_file_name,
                                               self.scratch, params)
            file_links = [{
                'path': otu_path,
                'name': os.path.basename(otu_path),
                'label': "OTU template file",
                'description': "file with each column containing the assigned sample_id and sample "
                               "name of each saved sample. Intended for uploading OTU data."
            }]
        else:
            file_links = []

        if params.get('incl_input_in_output'):
            sample_file = params.get('sample_file')
            if not os.path.isfile(sample_file):
                # try prepending '/staging/' to file and check then
                if os.path.isfile(os.path.join('/staging', sample_file)):
                    sample_file = os.path.join('/staging', sample_file)
                else:
                    raise ValueError(
                        f"input file {sample_file} does not exist.")
            sample_file_copy = os.path.join(self.scratch,
                                            os.path.basename(sample_file))
            shutil.copy(sample_file, sample_file_copy)
            file_links.append({
                "path": sample_file_copy,
                "name": os.path.basename(sample_file_copy),
                "label": "Input Sample file",
                "description": "Input file provided to create the sample set."
            })

        # create report
        report_client = KBaseReport(self.callback_url)
        report_name = "SampleSet_import_report_" + str(uuid.uuid4())
        report_info = report_client.create_extended_report({
            'message': f"SampleSet object named \"{set_name}\" imported.",
            'objects_created': [{'ref': sample_set_ref}],
            'file_links': file_links,
            'report_object_name': report_name,
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
            'sample_set': sample_set,
            'sample_set_ref': sample_set_ref
        }
        #END import_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
class CompareAnnotationsUtil:

    workdir = 'tmp/work/'
    staging_dir = "/staging/"
    datadir = "/kb/module/data/"

    def __init__(self, config):
        os.makedirs(self.workdir, exist_ok=True)
        self.config = config
        self.timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.genome_api = GenomeAnnotationAPI(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.ws_client = Workspace(config["workspace-url"])

        self.events = {}

    def get_ontology_events(self, params):

        if 'ontology_events' in self.genome:
            for event, ontology in enumerate(self.genome['ontology_events']):
                if 'description' not in ontology:
                    ontology['description'] = ontology['method']
                if ontology['description'] in params['annotations_to_compare'] or len(params['annotations_to_compare']) == 0:
                    self.events[event] = {}

                    ontology["id"] = mu.legacy_fix(ontology["id"])

                    for term in ontology:
                        self.events[event][term] = ontology[term]
        else:
            logging.info("No ontology events in this genome!")

        # logging.info(self.events)

    def summarize_gto(self, params):
        summary = {"genes": {},
                   "terms": {},
                   "rxns": {},
                   "ontology_events": {},
                   "orphan_terms": {}
                   }

        # add ontology events
        for ontology_event in self.events:
            summary['ontology_events'][ontology_event] = self.events[ontology_event]

        # add gene id to summary
        for feature in self.genome['features']:
            gene_id = feature['id']
            summary["genes"][gene_id] = {"terms": {},
                                         "rxns": {}
                                         }

            # get ontology term
            if "ontology_terms" in feature:

                for ontology_term_type in feature['ontology_terms']:
                    # note: ontology_term_type might be a legacy term and will need to
                    # be converted later, after making the term_dict

                    # ontology_term_type = ontology_term_type.upper()
                    # logging.info(ontology_term_type)
                    # if ontology_term_type in mu.legacy_codes:
                    #     ontology_term_type = mu.legacy_codes[ontology_term_type]
                    # logging.info(ontology_term_type)

                    term_dict = feature['ontology_terms'][ontology_term_type]

                    for term in term_dict:
                        for ontology_event in term_dict[term]:

                            # is this ontology event in the user-selected list?
                            if ontology_event in self.events:

                                rxn = "none"

                                # get the ontology type; upper-case it to make matching case-insensitive
                                ontology_type = summary['ontology_events'][ontology_event]['id'].upper()

                                # fix annotation term to fit with style

                                term = mu.standardize_annotation(term, ontology_type)

                                # convert terms to rxns
                                if term in self.translations[ontology_type]:
                                    rxn = self.translations[ontology_type][term]
                                else:
                                    if ontology_event in summary["orphan_terms"]:
                                        summary["orphan_terms"][ontology_event].append(term)
                                        summary["orphan_terms"][ontology_event] = list(
                                            set(summary["orphan_terms"][ontology_event]))
                                    else:
                                        summary["orphan_terms"][ontology_event] = [term]

                                # terms
                                if term in summary["genes"][gene_id]['terms']:
                                    summary["genes"][gene_id]['terms'][term].append(ontology_event)
                                else:
                                    summary["genes"][gene_id]['terms'][term] = [ontology_event]

                                if term in summary['terms']:
                                    summary['terms'][term].append(ontology_event)
                                    summary['terms'][term] = list(set(summary['terms'][term]))
                                else:
                                    summary['terms'][term] = [ontology_event]

                                # rxns
                                if rxn != "none":
                                    if rxn in summary["genes"][gene_id]['rxns']:
                                        summary["genes"][gene_id]['rxns'][rxn].append(
                                            ontology_event)
                                    else:
                                        summary["genes"][gene_id]['rxns'][rxn] = [ontology_event]

                                    if rxn in summary['rxns']:
                                        summary['rxns'][rxn].append(ontology_event)
                                        summary['rxns'][rxn] = list(set(summary['rxns'][rxn]))
                                    else:
                                        summary['rxns'][rxn] = [ontology_event]

        with open(os.path.join(self.scratch, "summary_dump.json"), 'w') as outfile:
            json.dump(summary, outfile, indent=2)

        return summary
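    # Illustrative (hypothetical) shape of the summary returned above:
    #   {"genes": {"gene_1": {"terms": {"K00001": [0]}, "rxns": {"rxn00001": [0]}}},
    #    "terms": {"K00001": [0]},
    #    "rxns": {"rxn00001": [0]},
    #    "ontology_events": {0: {"id": "KO", "description": "..."}},
    #    "orphan_terms": {0: ["K99999"]}}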

    def html_summary(self, params, summary):

        # convert gto summary for this report
        html_summary_report = {}

        for ontology_event in summary['ontology_events']:
            html_summary_report[ontology_event] = {"gene": [], "term": [], "rxn": []}

        for gene in summary["genes"]:
            for term in summary["genes"][gene]['terms']:
                for ontology_event in summary["genes"][gene]['terms'][term]:
                    html_summary_report[ontology_event]['gene'].append(gene)
                    html_summary_report[ontology_event]['term'].append(term)

                    html_summary_report[ontology_event]['gene'] = list(
                        set(html_summary_report[ontology_event]['gene']))
                    html_summary_report[ontology_event]['term'] = list(
                        set(html_summary_report[ontology_event]['term']))

            for rxn in summary["genes"][gene]['rxns']:
                for ontology_event in summary["genes"][gene]['rxns'][rxn]:
                    html_summary_report[ontology_event]['rxn'].append(rxn)
                    html_summary_report[ontology_event]['gene'].append(gene)

                    html_summary_report[ontology_event]['rxn'] = list(
                        set(html_summary_report[ontology_event]['rxn']))
                    html_summary_report[ontology_event]['gene'] = list(
                        set(html_summary_report[ontology_event]['gene']))

        output_html_files = list()

        # Make report directory and copy over files
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(output_directory)
        result_file_path = os.path.join(output_directory, 'compare_annotations_summary.html')

        # make html
        table_lines = []
        table_lines.append('<h2>Compare Annotations</h2>')
        table_lines.append('<h3>Summary</h3>')
        table_lines.append(
            '<table cellspacing="0" cellpadding="3" border="1"><tr><th>EVENT</th><th>DESCRIPTION</th><th>TYPE</th><th>GENES</th><th>TERMS</th><th>RXNS</th></tr>')
        for event in sorted(html_summary_report.keys()):
            # RAST/PROKKA don't have descriptions, but they have methods
            description = self.events[event].get('description', self.events[event]['method'])
            event_type = self.events[event]['id']
            genes_list = html_summary_report[event]['gene']
            terms_list = html_summary_report[event]['term']
            rxns_list = html_summary_report[event]['rxn']
            table_lines.append('<tr><td>' + str(event) + '</td><td>' + description + '</td><td>' + event_type + '</td><td>' + str(
                len(set(genes_list))) + '</td><td>' + str(len(terms_list)) + '</td><td>' + str(len(rxns_list)) + '</td></tr>')
        table_lines.append('</table>')

        # Write to file
        with open(result_file_path, 'w') as result_file:
            for line in table_lines:
                result_file.write(line + "\n")

        output_html_files.append(
            {'path': output_directory,
             'name': os.path.basename(result_file_path),
             'description': 'Summary Report'})

        # bokeh plots
        totals_file_path = os.path.join(output_directory, 'totals.html')
        output_file(totals_file_path, title="Totals")
        totals = self.plot_totals(summary)
        save(totals)
        output_html_files.append(
            {'path': output_directory,
             'name': os.path.basename(totals_file_path),
             'description': 'Ontology Totals'})

        csc_file_path = os.path.join(output_directory, 'csc.html')
        output_file(csc_file_path, title="CSC")
        csc = self.plot_csc2(summary)
        save(csc)
        output_html_files.append(
            {'path': output_directory,
             'name': os.path.basename(csc_file_path),
             'description': 'Cumulative Sum Plot'})

        # finalize html reports
        report_params = {
            'message': '',
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'workspace_name': params['workspace_name'],
            'report_object_name': f'compare_annotations_{uuid.uuid4()}'}

        output = self.kbr.create_extended_report(report_params)

        return {'report_name': output['name'],
                'report_ref': output['ref']}

# plotting functions
    def plot_totals(self, summary):
        descriptions = {}
        for o in summary["ontology_events"]:
            descriptions[o] = summary["ontology_events"][o].get(
                'description', summary["ontology_events"][o]['method']) + '_' + str(o)
            logging.info(descriptions[o])

        totals = {}
        for event in summary['ontology_events'].keys():
            totals[str(event)] = {'genes': [],
                                  'rxns': [],
                                  'terms': []}

        # genes
        for gene in summary['genes']:
            for term in summary['genes'][gene]['terms']:
                for event in summary['genes'][gene]['terms'][term]:
                    totals[str(event)]['genes'].append(gene)

        # terms
        for term in summary['terms']:
            for event in summary['terms'][term]:
                totals[str(event)]['terms'].append(term)

        # rxns
        for rxn in summary['rxns']:
            for event in summary['rxns'][rxn]:
                totals[str(event)]['rxns'].append(rxn)

        # sums
        events = []
        types = ['genes', 'terms', 'rxns']

        gene_counts = []
        rxn_counts = []
        term_counts = []

        for event in totals:
            logging.info(event)
            events.append(descriptions[int(event)])
            gene_counts.append(len(set(totals[event]['genes'])))
            rxn_counts.append(len(set(totals[event]['rxns'])))
            term_counts.append(len(set(totals[event]['terms'])))

        data = {'events': events,
                'genes': gene_counts,
                'terms': term_counts,
                'rxns': rxn_counts
                }

        x = [(event, type) for event in events for type in types]

        counts = sum(zip(data['genes'], data['terms'], data['rxns']), ())
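        # sum(zip(...), ()) above flattens the per-event (genes, terms, rxns) triples
        # so that counts[i] lines up with the (event, type) pairs in x.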
        source = ColumnDataSource(data=dict(x=x, counts=counts))

        p = figure(y_range=FactorRange(*x),
                   plot_height=400,
                   plot_width=1000,
                   title="Unique Counts per Annotation Event",
                   tools="wheel_zoom,box_zoom,reset,save")

        p.hbar(y='x',
               right='counts',
               height=0.9,
               source=source,
               line_color="black",
               fill_color=factor_cmap('x',
                                      palette=inferno(len(types)),
                                      factors=types,
                                      start=1,
                                      end=2))

        p.x_range.start = 0
        p.y_range.range_padding = 0.1
        p.yaxis.major_label_orientation = "horizontal"
        p.yaxis.subgroup_label_orientation = "horizontal"
        p.yaxis.group_label_orientation = "horizontal"
        p.ygrid.grid_line_color = None
        p.title.text_font_size = '12pt'
        p.xaxis.major_label_text_font_size = "12pt"
        p.yaxis.major_label_text_font_size = "12pt"
        p.yaxis.group_text_font_size = "12pt"
        p.add_tools(HoverTool(tooltips=[("Type", "@x"), ("Count", "@counts")]))

        return p

    def plot_csc2(self, summary, summary_type="rxns"):
        descriptions = {}
        for o in summary["ontology_events"]:
            descriptions[o] = summary["ontology_events"][o].get(
                'description', summary["ontology_events"][o]['method']) + ' (' + summary["ontology_events"][o]['id'] + ' #' + str(o) + ')'

        events = sorted(summary['ontology_events'].keys())
        rxns = summary[summary_type]

        # convert to sets
        rxns_in_events = dict((int(el), set()) for el in events)
        for rxn in rxns:
            for event in rxns[rxn]:
                rxns_in_events[event].add(rxn)

        winning_sets = {}
        winning_order = []
        baseline = 0
        df = pd.DataFrame(columns=["E", "C", "T", "L", "R"])
        # E=event, C=comparison, T=total, L=left, R=right

        for _ in range(len(rxns_in_events)):

            current_right = baseline
            current_left = baseline

            # get current winner
            longest_set_key = self.longest_set(rxns_in_events, winning_sets)

            # compare current winner to all past winners
            current = rxns_in_events[longest_set_key]
            for past_winner in winning_order:
                overlap = len(winning_sets[past_winner] & current)
                current_left -= overlap
                row = [descriptions[longest_set_key],  # E
                       descriptions[past_winner],  # C
                       overlap,  # T
                       current_left,  # L
                       current_left + overlap]  # R
                df.loc[len(df)] = row
                current = current - winning_sets[past_winner]

            # process current winner
            row = [descriptions[longest_set_key],  # E
                   descriptions[longest_set_key],  # C
                   len(current),  # T
                   current_right,  # L
                   current_right + len(current)]  # R

            df.loc[len(df)] = row  # add to df
            baseline += len(current)

            # move current winner to past winners
            winning_sets[longest_set_key] = rxns_in_events[longest_set_key]
            winning_order.append(longest_set_key)
            rxns_in_events[longest_set_key] = set()

        source = ColumnDataSource(df)

        type1_colormap = factor_cmap('E', palette=viridis(
            len(df.E.unique())), factors=df.E.unique())
        type2_colormap = factor_cmap('C', palette=viridis(
            len(df.C.unique())), factors=df.C.unique())

        p = figure(y_range=df.E.unique().tolist()[::-1],  # .tolist()[::-1] reverses the list.
                   plot_height=300,
                   plot_width=1000,
                   title="Annotation events ranked by \'" + str(summary_type) + "\' contribution",
                   tools="wheel_zoom,box_zoom,reset,save")

        p.hbar(y='E',
               height=0.9,
               left='L',
               right='R',
               source=source,
               fill_color=type2_colormap,
               line_color="black")

        p.add_tools(HoverTool(tooltips=[("Total", "@T"), ("Comparison", "@C")]))
        p.title.text_font_size = '12pt'
        p.xaxis.major_label_text_font_size = "12pt"
        p.yaxis.major_label_text_font_size = "12pt"
        return p

    def longest_set(self, s, w):
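        # Greedy helper for plot_csc2: subtract every past winner's set from each
        # event's remaining items, then return the event key with the largest remainder.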
        s = s.copy()
        for event in s:
            for winner in w:
                s[event] = s[event] - w[winner]

        # https://stackoverflow.com/a/21839239
        max_key, max_value = max(s.items(), key=lambda x: len(x[1]))
        return max_key

    def run(self, ctx, params):

        # collect some metadata
        self.genome = mu.get_genome(params['genome'], self.genome_api)

        self.get_ontology_events(params)
        self.translations = mu.get_translations(self.datadir)

        # summarize and make reports
        summary = self.summarize_gto(params)
        report = self.html_summary(params, summary)
        return report
Example #28
    def process_batch_result(self, batch_result, validated_params, reads,
                             input_set_info):

        n_jobs = len(batch_result['results'])
        n_success = 0
        n_error = 0
        ran_locally = 0
        ran_njsw = 0

        # reads alignment set items
        items = []
        objects_created = []

        for k in range(0, len(batch_result['results'])):
            job = batch_result['results'][k]
            result_package = job['result_package']
            if job['is_error']:
                n_error += 1
            else:
                n_success += 1
                output_info = result_package['result'][0]['output_info']
                ra_ref = output_info['upload_results']['obj_ref']
                # Note: could add a label to the alignment here?
                items.append({'ref': ra_ref, 'label': reads[k]['condition']})
                objects_created.append({'ref': ra_ref})

            if result_package['run_context']['location'] == 'local':
                ran_locally += 1
            if result_package['run_context']['location'] == 'njsw':
                ran_njsw += 1

        # Save the alignment set
        alignment_set_data = {'description': '', 'items': items}
        alignment_set_save_params = {
            'data': alignment_set_data,
            'workspace': validated_params['output_workspace'],
            'output_object_name': str(input_set_info[1]) + validated_params['output_obj_name_suffix']
        }

        set_api = SetAPI(self.srv_wiz_url)
        save_result = set_api.save_reads_alignment_set_v1(
            alignment_set_save_params)
        print('Saved ReadsAlignment=')
        pprint(save_result)
        objects_created.append({
            'ref': save_result['set_ref'],
            'description': 'Set of all reads alignments generated'
        })
        set_name = save_result['set_info'][1]

        # run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': save_result['set_ref']})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create the report
        report_text = 'Ran on SampleSet or ReadsSet.\n\n'
        report_text += 'Created ReadsAlignmentSet: ' + str(set_name) + '\n\n'
        report_text += 'Total ReadsLibraries = ' + str(n_jobs) + '\n'
        report_text += '        Successful runs = ' + str(n_success) + '\n'
        report_text += '            Failed runs = ' + str(n_error) + '\n'
        report_text += '       Ran on main node = ' + str(ran_locally) + '\n'
        report_text += '   Ran on remote worker = ' + str(ran_njsw) + '\n\n'

        print('Report text=')
        print(report_text)

        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'objects_created': objects_created,
            'report_object_name': 'kb_Bowtie2_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'workspace_name': validated_params['output_workspace']
        })

        result = {
            'report_info': {
                'report_name': report_info['name'],
                'report_ref': report_info['ref']
            }
        }
        result['batch_output_info'] = batch_result

        return result
Example #29
    def run_MotifSuite(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_MotifSuite
       
        report = KBaseReport(self.callback_url)
        mfmd_obj = MotifFindermfmd(self.callback_url)
        homer_obj = MotifFinderHomer(self.callback_url)
        meme_obj =  MotifFinderMEME(self.callback_url)
        gibbs_obj = MotifFinderGibbs(self.callback_url)
        ensemble_obj = MotifEnsemble(self.callback_url)

        '''result = homer_obj.DiscoverMotifsFromSequenceSet(params)
        print('Homer RESULT:')
        pprint(result)'''
     
        '''if os.path.exists('/kb/module/work/homer_out'):
           shutil.rmtree('/kb/module/work/homer_out')
        shutil.copytree('/kb/module/work/tmp/', '/kb/module/work/homer_out/')
        
        result = meme_obj.DiscoverMotifsFromSequenceSet(params)
        print('MEME RESULT:')
        pprint(result)
        '''
        result = mfmd_obj.DiscoverMotifsFromSequenceSet(params)
        print('MFMD RESULT:')
        pprint(result)

        '''result = ensemble_obj.MotifEnsemble(params)
        print('Ensemble RESULT:')
        print(result)

        
        if os.path.exists('/kb/module/work/meme_out'):
           shutil.rmtree('/kb/module/work/meme_out')
        shutil.copytree('/kb/module/work/tmp/', '/kb/module/work/meme_out/')

        result = gibbs_obj.ExtractPromotersFromFeatureSetandDiscoverMotifs(params)
        print('Gibbs RESULT:')
        pprint(result)
        if os.path.exists('/kb/module/work/gibbs_out'):
           shutil.rmtree('/kb/module/work/gibbs_out')
        shutil.copytree('/kb/module/work/tmp/', '/kb/module/work/gibbs_out/')

        #fix issue for MotifFindermfmd in catalogue  
        result = mfmd_obj.DiscoverMotifsFromSequenceSet(params)
        print('MFMD RESULT:')
        pprint(result)
        
        MSU=MotifSuiteUtil()
        params['motifset_refs']= MSU.get_obj_refs()

        result = ensemble_obj.MotifEnsemble(params)
        print('Ensemble RESULT:')
        print(result)
        '''
    
        report_info = report.create({'report': {'objects_created': [],
                                                'text_message': params['workspace_name']},
                                     'workspace_name': params['workspace_name']})
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_MotifSuite

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_MotifSuite return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #30
    def run_bsadkhinContigFilter(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_bsadkhinContigFilter

        # Print statements to stdout/stderr are captured and available as the App log
        logging.info('Starting run_bsadkhinContigFilter function. Params=' +
                     pformat(params))

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        logging.info('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError(
                'Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError(
                'Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError(
                'Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError(
                'Cannot parse integer from min_length parameter (' +
                str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' +
                             str(min_length) + ')')

        # Step 2 - Download the input data as a Fasta and
        # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        logging.info('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(
            {'ref': assembly_input_ref})

        # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
        # We can use BioPython to parse the Fasta file and build and save the output to a file.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        logging.info('Filtered Assembly to ' + str(n_remaining) +
                     ' contigs out of ' + str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder,
                                           'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

        # Step 4 - Save the new Assembly back to the system
        logging.info('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filtered_fasta_file},
            'workspace_name': workspace_name,
            'assembly_name': fasta_file['assembly_name']
        })

        # Step 5 - Build a Report and return
        reportObj = {
            'objects_created': [{
                'ref': new_assembly,
                'description': 'Filtered contigs'
            }],
            'text_message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': params['workspace_name']
        })

        # STEP 6: construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'assembly_output': new_assembly,
            'n_initial_contigs': n_total,
            'n_contigs_removed': n_total - n_remaining,
            'n_contigs_remaining': n_remaining
        }
        logging.info('returning:' + pformat(output))

        #END run_bsadkhinContigFilter

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_bsadkhinContigFilter return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #31
    def MotifEnsemble(self, ctx, params):
        """
        :param params: instance of type "EnsembleParams" (Internal workflow:
           1. Input - list of motifsets, workspace, threshold consensus
           2. Download MotifSets -> Utils function
           3. Assign motif ids by position in list; use refs to identify MSOs
              internally. Build a dictionary of motifsets (key: ref, val: set)
              and a list of match sets, where each item in a set is a tuple of
              (ref, index):
              for each motifset (enumerate to avoid duplicates):
                  for each motif in motifset:
                      for each other motifset (enumerate to avoid duplicates):
                          for each motif in other:
                              compare(motif1, motif2); if the motifs are the same,
                              search the list of sets for motif1: if found, add
                              motif2 if not already in; if not found, search the
                              list of sets for motif2: if found, add motif1, else
                              add a new set with motif1 + motif2)
           -> structure: parameter "motifset_refs" of list of String, parameter
           "workspace_name" of String, parameter "threshold" of Double
        :returns: instance of type "Ensemble_out" -> structure: parameter
           "motifset_ref" of String
        """
        # ctx is the context object
        # return variables are: out
        #BEGIN MotifEnsemble
        #TODO: ERROR CHECK (MULTIPLE MOTIFSETS, NONEMPTY, SSREF are the same, etc.)

        MotifSetDict = DownloadMotifSet(params['motifset_refs'],self.callback_url)

        matchSets = []
        threshold = float(params['threshold'])

        for i,MSR1 in enumerate(MotifSetDict.keys()):
            for j,motif1 in enumerate(MotifSetDict[MSR1]['Motifs']):
                for k,MSR2 in enumerate(MotifSetDict.keys()):
                    if k > i:
                        for l,motif2 in enumerate(MotifSetDict[MSR2]['Motifs']):
                            if CompareMotifsBP(motif1,motif2,threshold):
                                found1 = False
                                found2 = False
                                index1 = -1
                                index2 = -1
                                for m,mset in enumerate(matchSets):
                                    if (MSR1,j) in mset:
                                        found1 = True
                                        index1 = m
                                    if(MSR2,l) in mset:
                                        found2 = True
                                        index2 = m
                                if not found1 and found2:
                                    matchSets[index2].add((MSR1,j))
                                elif not found2 and found1:
                                    matchSets[index1].add((MSR2,l))
                                elif found1 and found2:
                                    if index1 != index2:
                                        matchSets[index1] |= matchSets[index2]
                                        matchSets.pop(index2)
                                else:
                                    matchSets.append(set([(MSR1,j),(MSR2,l)]))
        numMotifSets = len(params['motifset_refs'])
        threshold = float(params['proportion'])
        KeepSets = []
        print('NUM MATCHSETS********')
        print(len(matchSets))
        for i,mset in enumerate(matchSets):
            uniqueRefs = {}
            for pair in mset:
                if pair[0] not in uniqueRefs:
                    uniqueRefs[pair[0]] = pair[0]
            if float(len(uniqueRefs.keys()))/numMotifSets >= threshold:
                KeepSets.append(i)
        print(len(KeepSets))


        #handle duplicates...
        #for i,tuple1 in enumerate(matchSets):
        #    for j,tuple2 in enumerate(matchSets):
        #        if j > i:
        #            if tuple1[0] == tuple2[0]:
                        #handle this....
                        #how...?
                        #merge locations if theyre different
                        #pick one motif by default(p-val)
                        #run motif compare to ensure theyre actually similar enough
        #                print('duplicate')

        #create new MSO
        ESO = {}
        for ref in MotifSetDict:
            ESO['Condition'] = MotifSetDict[ref]['Condition']
            ESO['SequenceSet_ref'] = MotifSetDict[ref]['SequenceSet_ref']
            ESO['Alphabet'] = deepcopy(MotifSetDict[ref]['Alphabet'])
            ESO['Background'] = deepcopy(MotifSetDict[ref]['Background'])
            break
        ESO['Motifs'] = []
        #Add motifs
        for keep in KeepSets:
            motif = merge(matchSets[keep],MotifSetDict)
            ESO['Motifs'].append(deepcopy(motif))


        #upload new MSO
        dfu = DataFileUtil(self.callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        #save_objects_params['id'] = params['workspace_name']
        save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet' , 'data' : ESO , 'name' : 'EnsembleMotifSet'}]

        info = dfu.save_objects(save_objects_params)[0]
        obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        #create report
        htmlDir = self.shared_folder + '/ensemble_html'
        os.mkdir(htmlDir)
        MakeReport(htmlDir,ESO)


        try:
            html_upload_ret = dfu.file_to_shock({'file_path': htmlDir, 'make_handle': 0, 'pack': 'zip'})
        except Exception as err:
            raise ValueError('Error uploading HTML file to shock') from err



        #Create motif set object from MotifList
        #TODO set parameters correctly
        #add narrative support to set
        #MSO = {}
        #MSO['Condition'] = 'Temp'
        #MSO['FeatureSet_ref'] = '123'
        #MSO['Motifs'] = []
        #MSO['Alphabet'] = ['A','C','G','T']
        #MSO['Background'] = {}
        #for letter in MSO['Alphabet']:
        #    MSO['Background'][letter] = 0.0

        #MSU.parseMotifList(fullMotifList,MSO)
        #objname = 'MotifSet' + str(int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000))

        #Pass motif set into this
        #save_objects_params = {}
        #save_objects_params['id'] = self.ws_info[0]
        #save_objects_params['id'] = long(params['workspace_name'].split('_')[1])
        #save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        #save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet' , 'data' : MSO , 'name' : objname}]

        #info = dfu.save_objects(save_objects_params)[0]
        #motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        #object_upload_ret = dfu.file_to_shock()

        reportName = 'MEMEMotifFinder_report_'+str(uuid.uuid4())

        reportObj = {'objects_created': [{'ref' : obj_ref, 'description' : 'Motif Set generated by MEME'}],
                     'message': '',
                     'direct_html': None,
                     'direct_html_link_index': 0,
                     'file_links': [],
                     'html_links': [],
                     'html_window_height': 220,
                     'workspace_name': params['workspace_name'],
                     'report_object_name': reportName
                     }


        # attach the uploaded HTML report to the report object
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                    'name': 'index.html',
                                    'label': 'Motif ensemble HTML report'
                                    }
                                   ]


        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']})
        report_info = report.create_extended_report(reportObj)
        out = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] }

        #END MotifEnsemble

        # At some point might do deeper type checking...
        if not isinstance(out, dict):
            raise ValueError('Method MotifEnsemble return value ' +
                             'out is not type dict as required.')
        # return the results
        return [out]
Example #32
class QualiMapRunner:

    QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap'
    JAVA_MEM_DEFAULT_SIZE = '16G'
    LARGE_BAM_FILE_SIZE = 20 * 1024 * 1024 * 1024  # 20 GB
    TIMEOUT = 72 * 60 * 60  # 72 hours

    def _get_file_size(self, file_path):
        file_size = os.path.getsize(file_path)
        print('File size: {} -- {}'.format(file_size, file_path))
        return file_size

    def _large_file(self, file_path):

        filename, file_extension = os.path.splitext(file_path)
        multiplier = 0

        if file_extension == '.txt':
            total_file_size = 0
            with open(file_path, 'r') as f:
                for line in f:
                    bam_file_path = line.strip().split('\t')[1]  # strip() avoids a trailing newline in the path
                    total_file_size += self._get_file_size(bam_file_path)
            print('Total file size: {}'.format(total_file_size))
            multiplier = int(total_file_size) // int(self.LARGE_BAM_FILE_SIZE)
        else:
            multiplier = int(self._get_file_size(file_path)) // int(
                self.LARGE_BAM_FILE_SIZE)

        print('setting number of windows multiplier to: {}'.format(multiplier))

        return multiplier
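    # Worked example (hypothetical numbers): a single 50 GB BAM gives
    # 50 GB // LARGE_BAM_FILE_SIZE (20 GB) = 2, so run_bamqc/run_multi_sample_qc
    # would request 2 * 400 = 800 windows and the default 16G Java heap.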

    def _timeout_handler(self, signum, frame):
        print('Signal handler called with signal', signum)
        raise ValueError('QualiMap takes too long')

    def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
        self.scratch_dir = scratch_dir
        self.rau = ReadsAlignmentUtils(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        self.set_api = SetAPI(srv_wiz_url)
        self.ws = Workspace(workspace_url)
        self.valid_commands = ['bamqc', 'multi-bamqc']

    def run_app(self, params):
        self.validate_params(params)
        print('Validated Params = ')
        pprint(params)
        run_info = self.get_run_info(params)

        if run_info.get('mode') not in ['single', 'multi']:
            raise ValueError(
                'Error in fetching the type to determine run settings.')

        run_error = False
        try:
            signal.signal(signal.SIGALRM, self._timeout_handler)
            signal.alarm(self.TIMEOUT)
            if run_info['mode'] == 'single':
                result = self.run_bamqc(params['input_ref'],
                                        run_info['input_info'])
            elif run_info['mode'] == 'multi':
                result = self.run_multi_sample_qc(params['input_ref'],
                                                  run_info['input_info'])
            signal.alarm(0)
        except Exception:
            run_error = True

            workdir = os.path.join(self.scratch_dir,
                                   'qualimap_' + str(int(time.time() * 10000)))
            os.makedirs(workdir)

            with open(os.path.join(workdir, 'qualimapReport.html'),
                      'w') as report:
                report.write('<html><body><p></p></body></html>')

            package_info = self.package_output_folder(
                workdir, 'QualiMap_report',
                'EMPTY HTML report directory for QualiMap BAM QC',
                'qualimapReport.html')

            result = {
                'qc_result_folder_path': workdir,
                'qc_result_zip_info': package_info,
                'shock_id': None
            }
            error_msg = 'Running QualiMap returned an error:\n{}\n'.format(
                traceback.format_exc())
            error_msg += 'Generating simple report instead\n'
            print(error_msg)

        if params['create_report']:
            result = self.create_report(result, params['output_workspace'],
                                        run_error, params['input_ref'])

        return result

    def create_report(self,
                      result,
                      output_workspace,
                      run_error=None,
                      input_ref=None):

        if run_error:
            objects_created = []
            info = self.get_obj_info(input_ref)
            obj_type = self.get_type_from_obj_info(info)
            if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'Alignment'
                })

            if obj_type in [
                    'KBaseRNASeq.RNASeqAlignmentSet',
                    'KBaseSets.ReadsAlignmentSet'
            ]:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'AlignmentSet'
                })
                reads_alignment_info = self.get_alignments_from_set(input_ref)
                for alignment in reads_alignment_info:
                    alignment_ref = alignment.get('ref')
                    objects_created.append({
                        'ref': alignment_ref,
                        'description': 'Alignment'
                    })

            report_info = self.kbr.create_extended_report({
                'message':
                ' ',
                'objects_created':
                objects_created,
                'report_object_name':
                'qualimap_report' + str(uuid.uuid4()),
                'workspace_name':
                output_workspace
            })
            result['report_name'] = report_info['name']
            result['report_ref'] = report_info['ref']
            return result

        qc_result_zip_info = result['qc_result_zip_info']
        report_info = self.kbr.create_extended_report({
            'message':
            '',
            'objects_created': [],
            'direct_html_link_index':
            0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'report_object_name':
            'qualimap_report' + str(uuid.uuid4()),
            'workspace_name':
            output_workspace
        })
        result['report_name'] = report_info['name']
        result['report_ref'] = report_info['ref']
        return result

    def get_gtf_file(self, input_ref, set_op=False):

        print('Start fetching GFF file from genome')

        if set_op:
            set_data = self.set_api.get_reads_alignment_set_v1({
                'ref':
                input_ref,
                'include_item_info':
                1
            })
            input_ref = set_data['data']['items'][0]['ref']

        obj_data = self.dfu.get_objects({"object_refs":
                                         [input_ref]})['data'][0]['data']

        genome_ref = obj_data.get('genome_id')

        if not genome_ref:
            raise ValueError(
                'Alignment is not associated with a Genome object')

        result_directory = os.path.join(self.scratch_dir, str(uuid.uuid4()))
        os.makedirs(result_directory)

        genome_gtf_file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'is_gtf': True,
            'target_dir': result_directory
        })['file_path']

        return genome_gtf_file

    def run_bamqc(self, input_ref, input_info):
        # download the input and setup a working dir
        alignment_info = self.rau.download_alignment({'source_ref': input_ref})
        bam_file_path = self.find_my_bam_file(
            alignment_info['destination_dir'])
        try:
            gtf_file = self.get_gtf_file(input_ref)
        except Exception:
            gtf_file = ''

        workdir = os.path.join(self.scratch_dir,
                               'qualimap_' + str(int(time.time() * 10000)))

        options = [
            '-bam', bam_file_path, '-c', '-outdir', workdir, '-outformat',
            'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        options.append('--java-mem-size={}'.format(
            self.JAVA_MEM_DEFAULT_SIZE))  # always use large mem
        multiplier = self._large_file(bam_file_path)
        if multiplier:
            window_size = multiplier * 400
            print(f'using larger window size: {window_size} and Java memory: '
                  f'{self.JAVA_MEM_DEFAULT_SIZE}')
            options.extend(['-nw', str(window_size)])  # pass the flag and value as separate argv entries (shell=False)

        self.run_cli_command('bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap BAM QC', 'qualimapReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def run_multi_sample_qc(self, input_ref, input_info):
        # download the input and setup a working dir
        reads_alignment_info = self.get_alignments_from_set(input_ref)
        try:
            gtf_file = self.get_gtf_file(input_ref, set_op=True)
        except Exception:
            gtf_file = ''
        suffix = 'qualimap_' + str(int(time.time() * 10000))
        workdir = os.path.join(self.scratch_dir, suffix)
        os.makedirs(workdir)

        input_file_path = self.create_multi_qualimap_cfg(
            reads_alignment_info, workdir)

        options = [
            '-d', input_file_path, '-r', '-c', '-outdir', workdir,
            '-outformat', 'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        multiplier = self._large_file(input_file_path)
        if multiplier:
            window_size = multiplier * 400
            print(f'using larger window size: {window_size} and Java memory: '
                  f'{self.JAVA_MEM_DEFAULT_SIZE}')
            options.extend(['-nw', str(window_size)])  # pass the flag and value as separate argv entries (shell=False)
            options.append(f'--java-mem-size={self.JAVA_MEM_DEFAULT_SIZE}')

        self.run_cli_command('multi-bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap Multi-sample BAM QC',
            'multisampleBamQcReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def get_alignments_from_set(self, alignment_set_ref):
        set_data = self.set_api.get_reads_alignment_set_v1({
            'ref':
            alignment_set_ref,
            'include_item_info':
            1
        })
        items = set_data['data']['items']

        reads_alignment_data = []
        for alignment in items:
            alignment_info = self.rau.download_alignment(
                {'source_ref': alignment['ref']})
            bam_file_path = self.find_my_bam_file(
                alignment_info['destination_dir'])
            label = None
            if 'label' in alignment:
                label = alignment['label']
                # remove spacing in label
                label = '_'.join(label.split(' '))
            reads_alignment_data.append({
                'bam_file_path': bam_file_path,
                'ref': alignment['ref'],
                'label': label,
                'info': alignment['info']
            })
        return reads_alignment_data

    def create_multi_qualimap_cfg(self, reads_alignment_info, workdir):
        # Group by labels if there is at least one defined
        print('reads_alignment_info: {}'.format(reads_alignment_info))
        use_labels = False
        for alignment in reads_alignment_info:
            if alignment['label']:
                use_labels = True
                break

        # write the file
        input_file_path = os.path.join(workdir, 'multi_input.txt')
        print('Start generating: {}'.format(input_file_path))
        input_file = open(input_file_path, 'w')
        name_lookup = {}
        for alignment in reads_alignment_info:
            name = alignment['info'][1]
            if name in name_lookup:
                name_lookup[name] += 1
                name = name + '_' + str(name_lookup[name])
            else:
                name_lookup[name] = 1

            input_file.write(name + '\t' + alignment['bam_file_path'])
            if use_labels:
                if alignment['label']:
                    input_file.write('\t' + alignment['label'])
                else:
                    input_file.write('\tunlabeled')
            input_file.write('\n')
        input_file.close()

        with open(input_file_path, 'r') as f:
            print('Generated: {}'.format(input_file_path))
            print(f.read())

        return input_file_path
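    # Hedged sketch of the tab-delimited config written above (paths and labels are hypothetical);
    # columns are name<TAB>bam_path[<TAB>label], and the label column is present only when at
    # least one alignment in the set carries a label:
    #   sample_A    /kb/module/work/tmp/a.bam    treated
    #   sample_B    /kb/module/work/tmp/b.bam    unlabeled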

    def get_run_info(self, params):
        info = self.get_obj_info(params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            return {'mode': 'single', 'input_info': info}
        if obj_type in [
                'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet'
        ]:
            return {'mode': 'multi', 'input_info': info}
        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def validate_params(self, params):
        if 'input_ref' not in params:
            raise ValueError(
                'required parameter field "input_ref" was not set')

        create_report = False
        if 'create_report' in params:
            if int(params['create_report']) == 1:
                if not params.get('output_workspace'):
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                create_report = True
        params['create_report'] = create_report

    def run_cli_command(self, command, options, cwd=None):
        if command not in self.valid_commands:
            raise ValueError('Invalid QualiMap command: ' + str(command))
        command = [self.QUALIMAP_PATH, command] + options
        print('Running: ' + ' '.join(command))

        if not cwd:
            cwd = self.scratch_dir

        p = subprocess.Popen(command, cwd=cwd, shell=False)
        exitCode = p.wait()

        if exitCode == 0:
            print('Success, exit code was: ' + str(exitCode))
        else:
            raise ValueError('Error running command: ' + ' '.join(command) +
                             '\n' + 'Exit Code: ' + str(exitCode))

    def find_my_bam_file(self, dirpath):
        bam_path = None
        for f in os.listdir(dirpath):
            fullpath = os.path.join(dirpath, f)
            if os.path.isfile(fullpath) and f.lower().endswith('.bam'):
                if bam_path is not None:
                    raise ValueError(
                        'Error! Too many BAM files were downloaded for this alignment!'
                    )
                bam_path = fullpath
        if bam_path is None:
            raise ValueError(
                'Error! No BAM files were downloaded for this alignment!')
        return bam_path

    def package_output_folder(self, folder_path, zip_file_name,
                              zip_file_description, index_html_file):
        """ Simple utility for packaging a folder and saving to shock """
        output = self.dfu.file_to_shock({
            'file_path': folder_path,
            'make_handle': 0,
            'pack': 'zip'
        })
        return {
            'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description,
            'index_html_file_name': index_html_file
        }

    def get_type_from_obj_info(self, info):
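        # info[2] is the full type string, e.g. 'KBaseRNASeq.RNASeqAlignment-1.0';
        # splitting on '-' drops the type version.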
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
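

# --- Hedged usage sketch (not part of the original module) ---
# The service URLs and the alignment reference below are hypothetical; this only
# illustrates how QualiMapRunner.run_app is typically driven.
def _example_run_qualimap(scratch_dir, callback_url, workspace_url, srv_wiz_url,
                          alignment_ref, workspace_name):
    runner = QualiMapRunner(scratch_dir, callback_url, workspace_url, srv_wiz_url)
    return runner.run_app({
        'input_ref': alignment_ref,        # single alignment or alignment set UPA
        'create_report': 1,                # requires output_workspace when set
        'output_workspace': workspace_name,
    })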
Example #33
    def run_VariationAnalyzer(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "InputParams" -> structure: parameter
           "obj_name" of String, parameter "workspace_name" of String,
           parameter "fastq_ref" of String, parameter "map_qual" of Long,
           parameter "base_qual" of Long, parameter "min_cov" of Long,
           parameter "min_qual" of Long
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_VariationAnalyzer

        self.su.validate_params(params)

        logging.info("Downloading Fastq File")
        fastq_file = self.dfu._stage_input_file(params['fastq_ref'],
                                                "paired_end")

        logging.info("Downloading assembly file")
        genome_assembly = self.dfu.download_genome(
            params['genome_or_assembly_ref'])

        self.su.deinterleave(fastq_file['files']['fwd'], self.shared_folder)

        sample_name = "snippy_output"  # hardcoded to match the attribute mapping file

        snippy_output = self.shared_folder + "/" + sample_name

        cmd = self.su.build_snippy_command(genome_assembly['path'],
                                           snippy_output, self.shared_folder)

        self.su.run_snippy_command(cmd)

        params['vcf_staging_file_path'] = self.shared_folder + "/" + sample_name + "/snps.vcf"

        self.vu.save_variation_from_vcf(params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': params['fastq_ref']
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_VariationAnalyzer

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_VariationAnalyzer return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
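
        # Hedged example (not in the original) of the params dict this method consumes;
        # field names follow the docstring and the body above, values are hypothetical:
        # params = {'workspace_name': 'my_workspace',
        #           'obj_name': 'my_variation',
        #           'fastq_ref': '123/4/1',
        #           'genome_or_assembly_ref': '123/5/1',   # used by the body, not listed in the docstring
        #           'map_qual': 30, 'base_qual': 20, 'min_cov': 10, 'min_qual': 30}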
Example #34
class nmdc_mg_assembly:
    def __init__(self, callback_url, scratch, wdl='../../metaAssembly/'):
        self.callback_url = callback_url
        self.scratch = scratch
        self.special = special(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.report = KBaseReport(self.callback_url)
        self.wdl_base = wdl

    def validate_params(self, params):
        pass

    def fetch_reads_files(self, reads_upas):
        """
        From a list of reads UPAs, uses ReadsUtils to fetch the reads as files.
        Returns them as a dictionary from reads_upa -> filename
        """
        if reads_upas is None:
            raise ValueError("reads_upas must be a list of UPAs")
        if len(reads_upas) == 0:
            raise ValueError("reads_upas must contain at least one UPA")
        reads_info = self.ru.download_reads({
            'read_libraries': reads_upas,
            'interleaved': 'true',
            'gzipped': None
        })['files']
        file_set = dict()
        for reads in reads_info:
            file_set[reads] = reads_info[reads]['files']['fwd']
        return file_set
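
    # Hedged usage sketch (hypothetical UPA and path): fetch_reads_files(['123/4/1'])
    # would return something like {'123/4/1': '/kb/module/work/tmp/<id>.inter.fastq'},
    # i.e. a mapping from each reads UPA to its downloaded (interleaved) FASTQ file.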

    def run_wdl(self, rf):
        print(os.getcwd())
        wdl_files = ['jgi_assembly.wdl']

        for f in wdl_files:
            src = self.wdl_base + f
            dst = './' + f
            shutil.copy(src, dst)
        ins = {
            "jgi_metaASM.input_file": [rf.replace(self.scratch, './')],
            "jgi_metaASM.rename_contig_prefix": "contig",
            "jgi_metaASM.outdir": "/out/"
        }
        input_file = os.path.join(self.scratch, 'inputs.json')
        with open(input_file, 'w') as f:
            f.write(json.dumps(ins))

        p = {'workflow': wdl_files[0], 'inputs': 'inputs.json'}

        res = self.special.wdl(p)
        print('wdl: ' + str(res))

    def upload_assembly(self, file_path, workspace_name, assembly_name):
        """
        From a list of file paths, uploads them to KBase, generates Assembly objects,
        then returns the generated UPAs.
        """
        if not file_path:
            raise ValueError("file_path must be defined")
        if not os.path.exists(file_path):
            raise ValueError(
                "The given assembly file '{}' does not exist".format(
                    file_path))
        if not workspace_name:
            raise ValueError("workspace_name must be defined")
        if not assembly_name:
            raise ValueError("assembly_name must be defined")

        assembly_upa = self.au.save_assembly_from_fasta({
            "file": {
                "path": file_path
            },
            "workspace_name":
            workspace_name,
            "assembly_name":
            assembly_name
        })
        return assembly_upa

    def _upload_pipeline_result(self,
                                pipeline_result,
                                workspace_name,
                                assembly_name,
                                filtered_reads_name=None,
                                cleaned_reads_name=None,
                                skip_rqcfilter=False,
                                input_reads=None):
        """
        This is very tricky and uploads (optionally!) a few things under different cases.
        1. Uploads assembly
            - this always happens after a successful run.
        2. Cleaned reads - passed RQCFilter / BFC / SeqTK
            - optional, if cleaned_reads_name isn't None
        3. Filtered reads - passed RQCFilter
            - optional, if filtered_reads_name isn't None AND skip_rqcfilter is False
        returns a dict of UPAs with the following keys:
        - assembly_upa - the assembly (always)
        - filtered_reads_upa - the RQCFiltered reads (optionally)
        - cleaned_reads_upa - the RQCFiltered -> BFC -> SeqTK cleaned reads (optional)
        """

        # upload the assembly
        uploaded_assy_upa = self.file_util.upload_assembly(
            pipeline_result["spades"]["contigs_file"], workspace_name,
            assembly_name)
        upload_result = {"assembly_upa": uploaded_assy_upa}
        # upload filtered reads if we didn't skip RQCFilter (otherwise it's just a copy)
        if filtered_reads_name and not skip_rqcfilter:
            # unzip the cleaned reads because ReadsUtils won't do it for us.
            decompressed_reads = os.path.join(self.output_dir,
                                              "filtered_reads.fastq")
            pigz_command = "{} -d -c {} > {}".format(
                PIGZ, pipeline_result["rqcfilter"]["filtered_fastq_file"],
                decompressed_reads)
            p = subprocess.Popen(pigz_command,
                                 cwd=self.scratch_dir,
                                 shell=True)
            exit_code = p.wait()
            if exit_code != 0:
                raise RuntimeError(
                    "Unable to decompress filtered reads for validation! Can't upload them, either!"
                )
            filtered_reads_upa = self.file_util.upload_reads(
                decompressed_reads, workspace_name, filtered_reads_name,
                input_reads)
            upload_result["filtered_reads_upa"] = filtered_reads_upa
        # upload the cleaned reads
        if cleaned_reads_name:
            # unzip the cleaned reads because ReadsUtils won't do it for us.
            decompressed_reads = os.path.join(self.output_dir,
                                              "cleaned_reads.fastq")
            pigz_command = "{} -d -c {} > {}".format(
                PIGZ, pipeline_result["seqtk"]["cleaned_reads"],
                decompressed_reads)
            p = subprocess.Popen(pigz_command,
                                 cwd=self.scratch_dir,
                                 shell=True)
            exit_code = p.wait()
            if exit_code != 0:
                raise RuntimeError(
                    "Unable to decompress cleaned reads for validation! Can't upload them, either!"
                )
            cleaned_reads_upa = self.file_util.upload_reads(
                decompressed_reads, workspace_name, cleaned_reads_name,
                input_reads)
            upload_result["cleaned_reads_upa"] = cleaned_reads_upa
        return upload_result
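
        # Hedged example of the dict returned above (UPAs are hypothetical):
        # {'assembly_upa': '123/7/1',
        #  'filtered_reads_upa': '123/8/1',   # only if filtered_reads_name given and RQCFilter not skipped
        #  'cleaned_reads_upa': '123/9/1'}    # only if cleaned_reads_name given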

    def assemble(self, params):
        self.validate_params(params)
        workspace_name = params['workspace_name']
        assembly_name = params['output_assembly_name']

        # Stage Data
        files = self.fetch_reads_files([params["reads_upa"]])
        reads_files = list(files.values())

        # Run WDL
        self.run_wdl(reads_files[0])

        # Check if things ran
        mfile = os.path.join(self.scratch, 'meta.json')
        print(mfile)
        if not os.path.exists(mfile):
            raise OSError("Failed to run workflow")

        with open(mfile) as f:
            pipeline_output = json.loads(f.read())
        out = pipeline_output["calls"]["jgi_metaASM.create_agp"][0]["outputs"]
        print(out)

        # Generate Output Objects
        contigs_fn = out['outcontigs']
        upa = self.upload_assembly(contigs_fn, workspace_name, assembly_name)

        upload_kwargs = {}

        print("upload complete")

        # Do report
        report_info = self.report.create({
            'report': {
                'objects_created': [],
                'text_message': "Assemble metagenomic reads"
            },
            'workspace_name': workspace_name
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
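
# Hedged sketch (not in the original) of the Cromwell-style metadata fragment that
# assemble() expects to find in <scratch>/meta.json; keys mirror the lookups above
# and the contig path is hypothetical:
# {
#   "calls": {
#     "jgi_metaASM.create_agp": [
#       {"outputs": {"outcontigs": "/out/final_assembly/assembly.contigs.fasta"}}
#     ]
#   }
# }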
Example #35
    def run_Gblocks(self, ctx, params):
        """
        Method for trimming MSAs of either DNA or PROTEIN sequences
        **
        **        input_type: MSA
        **        output_type: MSA
        :param params: instance of type "Gblocks_Params" (Gblocks Input
           Params) -> structure: parameter "workspace_name" of type
           "workspace_name" (** The workspace object refs are of form: ** ** 
           objects = ws.get_objects([{'ref':
           params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means
           the entire name combining the workspace id and the object name **
           "id" is a numerical identifier of the workspace or object, and
           should just be used for workspace ** "name" is a string identifier
           of a workspace or object.  This is received from Narrative.),
           parameter "desc" of String, parameter "input_ref" of type
           "data_obj_ref", parameter "output_name" of type "data_obj_name",
           parameter "trim_level" of Long, parameter "min_seqs_for_conserved"
           of Long, parameter "min_seqs_for_flank" of Long, parameter
           "max_pos_contig_nonconserved" of Long, parameter "min_block_len"
           of Long, parameter "remove_mask_positions_flag" of Long
        :returns: instance of type "Gblocks_Output" (Gblocks Output) ->
           structure: parameter "report_name" of type "data_obj_name",
           parameter "report_ref" of type "data_obj_ref"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN run_Gblocks
        console = []
        invalid_msgs = []
        self.log(console,'Running run_Gblocks with params=')
        self.log(console, "\n"+pformat(params))
        report = ''
#        report = 'Running run_Gblocks with params='
#        report += "\n"+pformat(params)


        #### do some basic checks
        #
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'input_ref' not in params:
            raise ValueError('input_ref parameter is required')
        if 'output_name' not in params:
            raise ValueError('output_name parameter is required')


        #### Get the input_ref MSA object
        ##
        try:
            ws = workspaceService(self.workspaceURL, token=ctx['token'])
            objects = ws.get_objects([{'ref': params['input_ref']}])
            data = objects[0]['data']
            info = objects[0]['info']
            input_name = info[1]
            input_type_name = info[2].split('.')[1].split('-')[0]

        except Exception as e:
            raise ValueError('Unable to fetch input_ref object from workspace: ' + str(e))
            #to get the full stack trace: traceback.format_exc()

        if input_type_name == 'MSA':
            MSA_in = data
            row_order = []
            default_row_labels = dict()
            if 'row_order' in MSA_in.keys():
                row_order = MSA_in['row_order']
            else:
                row_order = sorted(MSA_in['alignment'].keys())

            if 'default_row_labels' in MSA_in.keys():
                default_row_labels = MSA_in['default_row_labels']
            else:
                for row_id in row_order:
                    default_row_labels[row_id] = row_id
            if len(row_order) < 2:
                self.log(invalid_msgs,"must have multiple records in MSA: "+params['input_ref'])

            # export features to FASTA file
            input_MSA_file_path = os.path.join(self.scratch, input_name+".fasta")
            self.log(console, 'writing fasta file: '+input_MSA_file_path)
            records = []
            for row_id in row_order:
                #self.log(console,"row_id: '"+row_id+"'")  # DEBUG
                #self.log(console,"alignment: '"+MSA_in['alignment'][row_id]+"'")  # DEBUG
                # SeqIO would write multiline sequences (Gblocks doesn't care, but FastTree
                # doesn't like multiline FASTA), so the records are written manually instead.
                #record = SeqRecord(Seq(MSA_in['alignment'][row_id]), id=row_id, description=default_row_labels[row_id])
                #records.append(record)
                #SeqIO.write(records, input_MSA_file_path, "fasta")
                records.extend(['>'+row_id,
                                MSA_in['alignment'][row_id]
                               ])
            with open(input_MSA_file_path,'w',0) as input_MSA_file_handle:
                input_MSA_file_handle.write("\n".join(records)+"\n")


            # Determine whether nuc or protein sequences
            #
            NUC_MSA_pattern = re.compile("^[\.\-_ACGTUXNRYSWKMBDHVacgtuxnryswkmbdhv \t\n]+$")
            all_seqs_nuc = True
            for row_id in row_order:
                #self.log(console, row_id+": '"+MSA_in['alignment'][row_id]+"'")
                if NUC_MSA_pattern.match(MSA_in['alignment'][row_id]) == None:
                    all_seqs_nuc = False
                    break

        # Unsupported input type
        #
        else:
            raise ValueError('Cannot yet handle input_ref type of: '+input_type_name)


        # DEBUG: check the MSA file contents
#        with open(input_MSA_file_path, 'r', 0) as input_MSA_file_handle:
#            for line in input_MSA_file_handle:
#                #self.log(console,"MSA_LINE: '"+line+"'")  # too big for console
#                self.log(invalid_msgs,"MSA_LINE: '"+line+"'")


        # validate input data
        #
        N_seqs = 0
        L_first_seq = 0
        with open(input_MSA_file_path, 'r', 0) as input_MSA_file_handle:
            for line in input_MSA_file_handle:
                if line.startswith('>'):
                    N_seqs += 1
                    continue
                if L_first_seq == 0:
                    for c in line:
                        if c != '-' and c != ' ' and c != "\n":
                            L_first_seq += 1
        # min_seqs_for_conserved
        if 'min_seqs_for_conserved' in params and params['min_seqs_for_conserved'] != None and int(params['min_seqs_for_conserved']) != 0:
            if int(params['min_seqs_for_conserved']) < int(0.5*N_seqs)+1:
                self.log(invalid_msgs,"Min Seqs for Conserved Pos ("+str(params['min_seqs_for_conserved'])+") must be >= N/2+1 (N="+str(N_seqs)+", N/2+1="+str(int(0.5*N_seqs)+1)+")\n")
            if int(params['min_seqs_for_conserved']) > int(params['min_seqs_for_flank']):
                self.log(invalid_msgs,"Min Seqs for Conserved Pos ("+str(params['min_seqs_for_conserved'])+") must be <= Min Seqs for Flank Pos ("+str(params['min_seqs_for_flank'])+")\n")

        # min_seqs_for_flank
        if 'min_seqs_for_flank' in params and params['min_seqs_for_flank'] != None and int(params['min_seqs_for_flank']) != 0:
            if int(params['min_seqs_for_flank']) > N_seqs:
                self.log(invalid_msgs,"Min Seqs for Flank Pos ("+str(params['min_seqs_for_flank'])+") must be <= N (N="+str(N_seqs)+")\n")

        # max_pos_contig_nonconserved
        if 'max_pos_contig_nonconserved' in params and params['max_pos_contig_nonconserved'] != None and int(params['max_pos_contig_nonconserved']) != 0:
            if int(params['max_pos_contig_nonconserved']) < 0:
                self.log(invalid_msgs,"Max Num Non-Conserved Pos ("+str(params['max_pos_contig_nonconserved'])+") must be >= 0"+"\n")
            if int(params['max_pos_contig_nonconserved']) > L_first_seq or int(params['max_pos_contig_nonconserved']) >= 32000:
                self.log(invalid_msgs,"Max Num Non-Conserved Pos ("+str(params['max_pos_contig_nonconserved'])+") must be <= L first seq ("+str(L_first_seq)+") and < 32000\n")

        # min_block_len
        if 'min_block_len' in params and params['min_block_len'] != None and int(params['min_block_len']) != 0:
            if int(params['min_block_len']) < 2:
                self.log(invalid_msgs,"Min Block Len ("+str(params['min_block_len'])+") must be >= 2"+"\n")
            if int(params['min_block_len']) > L_first_seq or int(params['min_block_len']) >= 32000:
                self.log(invalid_msgs,"Min Block Len ("+str(params['min_block_len'])+") must be <= L first seq ("+str(L_first_seq)+") and < 32000\n")

        # trim_level
        if 'trim_level' in params and params['trim_level'] != None and int(params['trim_level']) != 0:
            if int(params['trim_level']) < 0 or int(params['trim_level']) > 2:
                self.log(invalid_msgs,"Trim Level ("+str(params['trim_level'])+") must be >= 0 and <= 2"+"\n")


        if len(invalid_msgs) > 0:

            # load the method provenance from the context object
            self.log(console,"SETTING PROVENANCE")  # DEBUG
            provenance = [{}]
            if 'provenance' in ctx:
                provenance = ctx['provenance']
            # add additional info to provenance here, in this case the input data object reference
            provenance[0]['input_ws_objects'] = []
            provenance[0]['input_ws_objects'].append(params['input_ref'])
            provenance[0]['service'] = 'kb_gblocks'
            provenance[0]['method'] = 'run_Gblocks'

            # report
            report += "FAILURE\n\n"+"\n".join(invalid_msgs)+"\n"
            reportObj = {
                'objects_created':[],
                'text_message':report
                }

            reportName = 'gblocks_report_'+str(uuid.uuid4())
            report_obj_info = ws.save_objects({
#                'id':info[6],
                'workspace':params['workspace_name'],
                'objects':[
                    {
                        'type':'KBaseReport.Report',
                        'data':reportObj,
                        'name':reportName,
                        'meta':{},
                        'hidden':1,
                        'provenance':provenance
                    }
                ]
            })[0]


            self.log(console,"BUILDING RETURN OBJECT")
            returnVal = { 'report_name': reportName,
                          'report_ref': str(report_obj_info[6]) + '/' + str(report_obj_info[0]) + '/' + str(report_obj_info[4])
#                          'output_ref': None
                          }
            self.log(console,"run_Gblocks DONE")
            return [returnVal]


        ### Construct the command
        #
        #  e.g.
        #  for "0.5" gaps: cat "o\n<MSA_file>\nb\n5\ng\nm\nq\n" | Gblocks
        #  for "all" gaps: cat "o\n<MSA_file>\nb\n5\n5\ng\nm\nq\n" | Gblocks
        #
        gblocks_cmd = [self.GBLOCKS_bin]

        # check for necessary files
        if not os.path.isfile(self.GBLOCKS_bin):
            raise ValueError("no such file '"+self.GBLOCKS_bin+"'")
        if not os.path.isfile(input_MSA_file_path):
            raise ValueError("no such file '"+input_MSA_file_path+"'")
        if not os.path.getsize(input_MSA_file_path) > 0:
            raise ValueError("empty file '"+input_MSA_file_path+"'")

        # DEBUG
#        with open(input_MSA_file_path,'r',0) as input_MSA_file_handle:
#            for line in input_MSA_file_handle:
#                #self.log(console,"MSA LINE: '"+line+"'")  # too big for console
#                self.log(invalid_msgs,"MSA LINE: '"+line+"'")


        # set the output path
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000)
        output_dir = os.path.join(self.scratch,'output.'+str(timestamp))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Gblocks names output blocks MSA by appending "-gb" to input file
        #output_GBLOCKS_file_path = os.path.join(output_dir, input_name+'-gb')
        output_GBLOCKS_file_path = input_MSA_file_path+'-gb'
        output_aln_file_path = output_GBLOCKS_file_path

        # Gblocks is interactive and only accepts args from pipe input
        #if 'arg' in params and params['arg'] != None and params['arg'] != 0:
        #    fasttree_cmd.append('-arg')
        #    fasttree_cmd.append(val)


        # Run GBLOCKS, capture output as it happens
        #
        self.log(console, 'RUNNING GBLOCKS:')
        self.log(console, '    '+' '.join(gblocks_cmd))
#        report += "\n"+'running GBLOCKS:'+"\n"
#        report += '    '+' '.join(gblocks_cmd)+"\n"

        # Gblocks is interactive and reads its options from stdin, so the process is opened with pipes
        env = os.environ.copy()
        #joined_fasttree_cmd = ' '.join(fasttree_cmd)  # redirect out doesn't work with subprocess unless you join command first
        #p = subprocess.Popen([joined_fasttree_cmd], \
        p = subprocess.Popen(gblocks_cmd, \
                             cwd = self.scratch, \
                             stdin = subprocess.PIPE, \
                             stdout = subprocess.PIPE, \
                             stderr = subprocess.PIPE, \
                             shell = True, \
                             env = env)
#                             executable = '/bin/bash' )

        
        # write commands to process
        #
        #  for "0.5" gaps: cat "o\n<MSA_file>\nb\n5\ng\nm\nq\n" | Gblocks
        #  for "all" gaps: cat "o\n<MSA_file>\nb\n5\n5\ng\nm\nq\n" | Gblocks

        p.stdin.write("o"+"\n")  # open MSA file
        p.stdin.write(input_MSA_file_path+"\n")

        if 'trim_level' in params and params['trim_level'] != None and int(params['trim_level']) != 0:
            p.stdin.write("b"+"\n")
            if int(params['trim_level']) >= 1:
                self.log (console,"changing trim level")
                p.stdin.write("5"+"\n")  # set to "half"
                if int(params['trim_level']) == 2:
                    self.log (console,"changing trim level")
                    p.stdin.write("5"+"\n")  # set to "all"
                elif int(params['trim_level']) > 2:
                    raise ValueError ("trim_level ("+str(params['trim_level'])+") was not between 0-2")
                p.stdin.write("m"+"\n")

        # flank must precede conserved because it acts as an upper bound for acceptable conserved values
        if 'min_seqs_for_flank' in params and params['min_seqs_for_flank'] != None and int(params['min_seqs_for_flank']) != 0:
            self.log (console,"changing min_seqs_for_flank")
            p.stdin.write("b"+"\n")
            p.stdin.write("2"+"\n")
            p.stdin.write(str(params['min_seqs_for_flank'])+"\n")
            p.stdin.write("m"+"\n")

        if 'min_seqs_for_conserved' in params and params['min_seqs_for_conserved'] != None and int(params['min_seqs_for_conserved']) != 0:
            self.log (console,"changing min_seqs_for_conserved")
            p.stdin.write("b"+"\n")
            p.stdin.write("1"+"\n")
            p.stdin.write(str(params['min_seqs_for_conserved'])+"\n")
            p.stdin.write("m"+"\n")

        if 'max_pos_contig_nonconserved' in params and params['max_pos_contig_nonconserved'] != None and int(params['max_pos_contig_nonconserved']) > -1:
            self.log (console,"changing max_pos_contig_nonconserved")
            p.stdin.write("b"+"\n")
            p.stdin.write("3"+"\n")
            p.stdin.write(str(params['max_pos_contig_nonconserved'])+"\n")
            p.stdin.write("m"+"\n")

        if 'min_block_len' in params and params['min_block_len'] != None and params['min_block_len'] != 0:
            self.log (console,"changing min_block_len")
            p.stdin.write("b"+"\n")
            p.stdin.write("4"+"\n")
            p.stdin.write(str(params['min_block_len'])+"\n")
            p.stdin.write("m"+"\n")
        
        p.stdin.write("g"+"\n")  # get blocks
        p.stdin.write("q"+"\n")  # quit
        p.stdin.close()
        p.wait()


        # Read output
        #
        while True:
            line = p.stdout.readline()
            #line = p.stderr.readline()
            if not line: break
            self.log(console, line.replace('\n', ''))

        p.stdout.close()
        #p.stderr.close()
        p.wait()
        self.log(console, 'return code: ' + str(p.returncode))
#        if p.returncode != 0:
        if p.returncode != 1:  # Gblocks appears to exit with code 1 even on success
            raise ValueError('Error running GBLOCKS, return code: '+str(p.returncode) + 
                '\n\n'+ '\n'.join(console))

        # Check that GBLOCKS produced output
        #
        if not os.path.isfile(output_GBLOCKS_file_path):
            raise ValueError("failed to create GBLOCKS output: "+output_GBLOCKS_file_path)
        elif not os.path.getsize(output_GBLOCKS_file_path) > 0:
            raise ValueError("created empty file for GBLOCKS output: "+output_GBLOCKS_file_path)


        # load the method provenance from the context object
        #
        self.log(console,"SETTING PROVENANCE")  # DEBUG
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = []
        provenance[0]['input_ws_objects'].append(params['input_ref'])
        provenance[0]['service'] = 'kb_gblocks'
        provenance[0]['method'] = 'run_Gblocks'


        # reformat output to single-line FASTA MSA and check that the output is not empty
        # (this often happens when parameter combinations don't produce viable blocks)
        #
        output_fasta_buf = []
        id_order = []
        this_id = None
        ids = dict()
        alignment = dict()
        L_alignment = 0
        L_alignment_set = False
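        # The Gblocks '-gb' FASTA output may wrap sequences and pad columns with spaces,
        # so the loop below strips whitespace and rebuilds each row as a single string.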
        with open(output_GBLOCKS_file_path,'r',0) as output_GBLOCKS_file_handle:
            for line in output_GBLOCKS_file_handle:
                line = line.rstrip()
                if line.startswith('>'):
                    this_id = line[1:]
                    output_fasta_buf.append ('>'+re.sub('\s','_',default_row_labels[this_id]))
                    id_order.append(this_id)
                    alignment[this_id] = ''
                    if L_alignment != 0 and not L_alignment_set:
                         L_alignment_set = True
                    continue
                output_fasta_buf.append (line)
                for c in line:
                    if c != ' ' and c != "\n":
                        alignment[this_id] += c
                        if not L_alignment_set:
                            L_alignment += 1
        if L_alignment == 0:
            self.log(invalid_msgs,"params produced no blocks.  Consider changing to less stringent values")
        else:
            if 'remove_mask_positions_flag' in params and params['remove_mask_positions_flag'] != None and params['remove_mask_positions_flag'] != '' and params['remove_mask_positions_flag'] == 1:
                self.log (console,"removing mask positions")
                mask = []
                new_alignment = dict()
                for i in range(0,L_alignment):
                    mask.append('+')  # append rather than index into the (initially empty) list
                    if alignment[id_order[0]][i] == '-' \
                        or alignment[id_order[0]][i] == 'X' \
                        or alignment[id_order[0]][i] == 'x':
                        mask[i] = '-'
                for row_id in id_order:
                    new_alignment[row_id] = ''
                    for i,c in enumerate(alignment[row_id]):
                         if mask[i] == '+':
                            new_alignment[row_id] += c
                alignment = new_alignment

            L_alignment = len(alignment[id_order[0]])

            # write fasta with tidied ids
            output_MSA_file_path = os.path.join(output_dir, params['output_name']+'.fasta')
            with open(output_MSA_file_path,'w',0) as output_MSA_file_handle:
                output_MSA_file_handle.write("\n".join(output_fasta_buf)+"\n")


        # Upload results
        #
        if len(invalid_msgs) == 0:
            self.log(console,"UPLOADING RESULTS")  # DEBUG

# Didn't write file
#            with open(output_MSA_file_path,'r',0) as output_MSA_file_handle:
#                output_MSA_buf = output_MSA_file_handle.read()
#            output_MSA_buf = output_MSA_buf.rstrip()
#            self.log(console,"\nMSA:\n"+output_MSA_buf+"\n")
        
            # Build output_MSA structure
            #   first extract old info from MSA (labels, ws_refs, etc.)
            #
            MSA_out = dict()
            for key in MSA_in.keys():
                 MSA_out[key] = MSA_in[key]

            # then replace with new info
            #
            MSA_out['alignment'] = alignment
            MSA_out['name'] = params['output_name']
            MSA_out['alignment_length'] = alignment_length = L_alignment
            MSA_name = params['output_name']
            MSA_description = ''
            if 'desc' in params and params['desc'] != None and params['desc'] != '':
                MSA_out['desc'] = MSA_description = params['desc']

            # Store MSA_out
            #
            new_obj_info = ws.save_objects({
                            'workspace': params['workspace_name'],
                            'objects':[{
                                    'type': 'KBaseTrees.MSA',
                                    'data': MSA_out,
                                    'name': params['output_name'],
                                    'meta': {},
                                    'provenance': provenance
                                }]
                        })[0]


            # create CLW formatted output file
            max_row_width = 60
            id_aln_gap_width = 1
            gap_chars = ''
            for sp_i in range(id_aln_gap_width):
                gap_chars += ' '
            # DNA
            if all_seqs_nuc:
                strong_groups = { 'AG': True,
                                  'CTU': True
                                  }
                weak_groups = None
            # PROTEINS
            else:
                strong_groups = { 'AST':  True,
                                  'EKNQ': True,
                                  'HKNQ': True,
                                  'DENQ': True,
                                  'HKQR': True,
                                  'ILMV': True,
                                  'FILM': True,
                                  'HY':   True,
                                  'FWY':  True
                                  }
                weak_groups = { 'ACS':    True,
                                'ATV':    True,
                                'AGS':    True,
                                'KNST':   True,
                                'APST':   True,
                                'DGNS':   True,
                                'DEKNQS': True,
                                'DEHKNQ': True,
                                'EHKNQR': True,
                                'FILMV':  True,
                                'FHY':    True
                                }
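
            # CLUSTAL-style conservation line built further below: '*' marks a fully
            # conserved column, ':' a column whose residues all fall in one strong group,
            # '.' one weak group, and ' ' otherwise.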
                
            clw_buf = []
            clw_buf.append ('CLUSTALW format of GBLOCKS trimmed MSA '+MSA_name+': '+MSA_description)
            clw_buf.append ('')

            long_id_len = 0
            aln_pos_by_id = dict()
            for row_id in row_order:
                aln_pos_by_id[row_id] = 0
                row_id_disp = default_row_labels[row_id]
                if long_id_len < len(row_id_disp):
                    long_id_len = len(row_id_disp)

            full_row_cnt = alignment_length // max_row_width
            if alignment_length % max_row_width == 0:
                full_row_cnt -= 1
            for chunk_i in range (full_row_cnt + 1):
                for row_id in row_order:
                    row_id_disp = re.sub('\s','_',default_row_labels[row_id])
                    for sp_i in range (long_id_len-len(row_id_disp)):
                        row_id_disp += ' '

                    aln_chunk_upper_bound = (chunk_i+1)*max_row_width
                    if aln_chunk_upper_bound > alignment_length:
                        aln_chunk_upper_bound = alignment_length
                    aln_chunk = alignment[row_id][chunk_i*max_row_width:aln_chunk_upper_bound]
                    for c in aln_chunk:
                        if c != '-':
                            aln_pos_by_id[row_id] += 1

                    clw_buf.append (row_id_disp+gap_chars+aln_chunk+' '+str(aln_pos_by_id[row_id]))

                # conservation line
                cons_line = ''
                for pos_i in range(chunk_i*max_row_width, aln_chunk_upper_bound):
                    col_chars = dict()
                    seq_cnt = 0
                    for row_id in row_order:
                        char = alignment[row_id][pos_i]
                        if char != '-':
                            seq_cnt += 1
                            col_chars[char] = True
                    if seq_cnt <= 1:
                        cons_char = ' '
                    elif len(col_chars.keys()) == 1:
                        cons_char = '*'
                    else:
                        strong = False
                        for strong_group in strong_groups.keys():
                            this_strong_group = True
                            for seen_char in col_chars.keys():
                                if seen_char not in strong_group:
                                    this_strong_group = False
                                    break
                            if this_strong_group:
                                strong = True
                                break
                        if not strong:
                            weak = False
                            if weak_groups != None:
                                for weak_group in weak_groups.keys():
                                    this_weak_group = True
                                    for seen_char in col_chars.keys():
                                        if seen_char not in weak_group:
                                            this_weak_group = False
                                            break
                                    if this_weak_group:
                                        weak = True
                        if strong:
                            cons_char = ':'
                        elif weak:
                            cons_char = '.'
                        else:
                            cons_char = ' '
                    cons_line += cons_char

                lead_space = ''
                for sp_i in range(long_id_len):
                    lead_space += ' '
                lead_space += gap_chars

                clw_buf.append(lead_space+cons_line)
                clw_buf.append('')

            # write clw to file
            clw_buf_str = "\n".join(clw_buf)+"\n"
            output_clw_file_path = os.path.join(output_dir, input_name+'-MSA.clw')
            with open (output_clw_file_path, "w", 0) as output_clw_file_handle:
                output_clw_file_handle.write(clw_buf_str)


            # upload GBLOCKS FASTA output to SHOCK for file_links
            dfu = DFUClient(self.callbackURL)
            try:
                output_upload_ret = dfu.file_to_shock({'file_path': output_aln_file_path,
# DEBUG
#                                                      'make_handle': 0,
#                                                      'pack': 'zip'})
                                                       'make_handle': 0})
            except Exception as e:
                raise ValueError('error loading aln_out file to shock: ' + str(e))

            # upload GBLOCKS CLW output to SHOCK for file_links
            try:
                output_clw_upload_ret = dfu.file_to_shock({'file_path': output_clw_file_path,
# DEBUG
#                                                      'make_handle': 0,
#                                                      'pack': 'zip'})
                                                           'make_handle': 0})
            except Exception as e:
                raise ValueError('error loading clw_out file to shock: ' + str(e))


            # make HTML reports
            #
            # HERE


            # build output report object
            #
            self.log(console,"BUILDING REPORT")  # DEBUG

            reportName = 'gblocks_report_'+str(uuid.uuid4())
            reportObj = {
                'objects_created':[{'ref':params['workspace_name']+'/'+params['output_name'],
                                    'description':'GBLOCKS MSA'}],
                #'message': '',
                'message': clw_buf_str,
                'direct_html': '',
                #'direct_html_link_index': 0,
                'file_links': [],
                'html_links': [],
                'workspace_name': params['workspace_name'],
                'report_object_name': reportName
                }
            reportObj['file_links'] = [{'shock_id': output_upload_ret['shock_id'],
                                        'name': params['output_name']+'-GBLOCKS.FASTA',
                                        'label': 'GBLOCKS-trimmed MSA FASTA'
                                        },
                                       {'shock_id': output_clw_upload_ret['shock_id'],
                                        'name': params['output_name']+'-GBLOCKS.CLW',
                                        'label': 'GBLOCKS-trimmed MSA CLUSTALW'
                                        }]

            # save report object
            #
            SERVICE_VER = 'release'
            reportClient = KBaseReport(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER)
            #report_info = report.create({'report':reportObj, 'workspace_name':params['workspace_name']})
            report_info = reportClient.create_extended_report(reportObj)                                       

        else:  # len(invalid_msgs) > 0
            reportName = 'gblocks_report_'+str(uuid.uuid4())
            report += "FAILURE:\n\n"+"\n".join(invalid_msgs)+"\n"
            reportObj = {
                'objects_created':[],
                'text_message':report
                }

            ws = workspaceService(self.workspaceURL, token=ctx['token'])
            report_obj_info = ws.save_objects({
                    #'id':info[6],
                    'workspace':params['workspace_name'],
                    'objects':[
                        {
                            'type':'KBaseReport.Report',
                            'data':reportObj,
                            'name':reportName,
                            'meta':{},
                            'hidden':1,
                            'provenance':provenance
                            }
                        ]
                    })[0]

            report_info = dict()
            report_info['name'] = report_obj_info[1]
            report_info['ref'] = str(report_obj_info[6])+'/'+str(report_obj_info[0])+'/'+str(report_obj_info[4])


        # done
        returnVal = { 'report_name': report_info['name'],
                      'report_ref': report_info['ref']
                      }

        self.log(console,"run_Gblocks DONE")
        #END run_Gblocks

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method run_Gblocks return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]