Example #1
def mock_gapi_fetch_sequence(params):
    logging.info('Mocking `gapi.fetch_sequence(%s)`' % str(params))

    upa = ref_leaf(params)
    fp = _glob_upa(FETCH_SEQUENCE_DIR, upa)

    # Download and cache
    if fp is None:
        logging.info('Calling in cache mode `gapi.fetch_sequence(%s)`' % str(params))

        gapi = GenericsAPI(os.environ['SDK_CALLBACK_URL'], service_ver='dev')
        fp_work = gapi.fetch_sequence(params)
        fp_cache = os.path.join(
            mkcache(FETCH_SEQUENCE_DIR),
            file_safe_ref(upa) + '.fa'
        )
        shutil.copyfile(
            fp_work,
            fp_cache
        )
        return fp_work

    # Pull from cache
    else:
        return fp
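
A minimal sketch of how a cache-backed mock like this might be swapped in during a unit test with `unittest.mock.patch`; the patch target (`my_module.GenericsAPI`) is a hypothetical placeholder for whatever module imports the client, not this repo's actual test harness:

from unittest.mock import patch

# Replace GenericsAPI in the (hypothetical) module under test so that every
# gapi.fetch_sequence(params) call goes through the caching mock above.
with patch('my_module.GenericsAPI') as mock_cls:
    mock_cls.return_value.fetch_sequence.side_effect = mock_gapi_fetch_sequence
    # ... exercise the code under test here ...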
Example #2
    def __init__(self, config, scratch, callback_url):
        self.scratch = scratch
        self.ws_url = config['workspace-url']
        self.callback_url = callback_url
        # All service clients share the job's SDK callback URL
        self.dfu = DataFileUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)
        self.au = AssemblyUtil(self.callback_url)
        self.gapi = GenericsAPI(self.callback_url)
Example #3
def update_clients():
    callback_url = os.environ['SDK_CALLBACK_URL']
    Var.update(
        dfu=DataFileUtil(callback_url),
        kbr=KBaseReport(callback_url),
        fpu=FunctionalProfileUtil(callback_url, service_ver='dev'),
        gapi=GenericsAPI(callback_url, service_ver='dev'),
    )
Example #4
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.dfu = DataFileUtil(self.callback_url)

        # set up directory for files folder
        self.output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(self.output_dir)
        self.files_folder = os.path.join(self.output_dir, 'files')
        os.mkdir(self.files_folder)

        self.file_paths = []
        self.html_paths = []

        self.GenAPI = GenericsAPI(self.callback_url)
Example #5
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.report_util = kb_GenericsReport(self.callback_url)
        self.generics_api = GenericsAPI(self.callback_url)
        self.ws_large_data = WsLargeDataIO(self.callback_url)

        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
Example #6
    @classmethod
    def setUpClass(cls):
        token = os.environ.get('KB_AUTH_TOKEN', None)
        config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_clustering'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token': token,
            'user_id': user_id,
            'provenance': [{
                'service': 'kb_clustering',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated': 1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = Workspace(cls.wsURL)
        cls.serviceImpl = kb_clustering(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']
        suffix = int(time.time() * 1000)
        cls.wsName = "test_ContigFilter_" + str(suffix)
        ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
        cls.wsId = ret[0]
        cls.dfu = DataFileUtil(cls.callback_url)
        cls.gen_api = GenericsAPI(cls.callback_url, service_ver='dev')

        cls.prepare_data()
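
A matching tearDownClass is typically paired with a fixture like this to remove the throwaway workspace; a minimal companion sketch, assuming the standard KBase SDK test scaffold:

    @classmethod
    def tearDownClass(cls):
        # Delete the test workspace created in setUpClass, if it was created.
        if hasattr(cls, 'wsName'):
            cls.wsClient.delete_workspace({'workspace': cls.wsName})
            print('Test workspace was deleted')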
Example #7
    def run_picrust2_pipeline(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_picrust2_pipeline
        ####################################################################################################
        ####################################################################################################
        ####################################################################################################
        ####################################################################################################
        ####################################################################################################

        logging.info(params)

        #
        ##
        ### params, app-globals, directories, etc
        ####
        #####

        logging.info('BEGINNING KB_PICRUST2. params: %s' % str(params))

        params = Params(params)

        dprint('params', run=locals())

        reset_Var()  # clear all fields but `debug`

        Var.update(
            params=params,
            dfu=DataFileUtil(self.callback_url),
            kbr=KBaseReport(self.callback_url),
            fpu=FunctionalProfileUtil(self.callback_url, service_ver='beta'),
            gapi=GenericsAPI(self.callback_url),
            shared_folder=self.shared_folder,
            run_dir=os.path.join(self.shared_folder,
                                 'run_dir_picrust2_' + str(uuid.uuid4())),
            warnings=[],
            objects_created=[],
        )

        os.mkdir(Var.run_dir)  # for this API-method run

        Var.update(return_dir=os.path.join(Var.run_dir, 'return'), )

        os.mkdir(Var.return_dir)  # for return input/output/logs etc.

        if Var.debug:
            with open(os.path.join(Var.run_dir, '#params'), 'w') as fh:
                json.dump(params.params, fh)

        # TODO document `run_dir` structure

        #
        ##
        ### obj
        ####
        #####

        # instantiate

        amp_mat = AmpliconMatrix(params['amplicon_matrix_upa'])
        if 'row_attributemapping_ref' in amp_mat.obj:
            row_attrmap = AttributeMapping(
                amp_mat.obj['row_attributemapping_ref'], amp_mat)
        else:
            msg = (
                "Input AmpliconMatrix "
                "does not have a row AttributeMapping to assign PICRUSt2 functions to."
            )
            logging.warning(msg)
            Var.warnings.append(msg)

        # validate input data

        amp_mat.validate_amplicon_abundance_data()

        # generate input files

        seq_flpth = os.path.join(Var.return_dir, 'study_seqs.fna')
        seq_abundance_table_flpth = os.path.join(Var.return_dir,
                                                 'study_seqs.tsv')

        amp_mat.to_fasta(seq_flpth)
        amp_mat.to_seq_abundance_table(seq_abundance_table_flpth)

        # objs should be app globals
        Var.amp_mat = amp_mat

        #
        ##
        ### args
        ####
        #####

        # TODO get tee functionality working in run_check
        # to avoid extra cmd

        Var.out_dir = os.path.join(Var.return_dir, 'PICRUSt2_output')
        log_flpth = os.path.join(Var.return_dir, 'log.txt')
        p = 4

        cmd_pipeline = ' '.join([
            'set -o pipefail &&',
            'source activate picrust2 &&',
            'picrust2_pipeline.py',
            '-s',
            seq_flpth,
            '-i',
            seq_abundance_table_flpth,
            '-o',
            Var.out_dir,
            '--per_sequence_contrib',
            '-p',
            str(p),
            '|& tee',
            log_flpth,
        ])
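        # For illustration only (paths abbreviated), the joined command looks like:
        #   set -o pipefail && source activate picrust2 && picrust2_pipeline.py \
        #       -s .../return/study_seqs.fna -i .../return/study_seqs.tsv \
        #       -o .../return/PICRUSt2_output --per_sequence_contrib -p 4 |& tee .../return/log.txt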

        cmd_description = ' \\\n'.join([
            'cd %s &&' % Var.out_dir, 'source activate picrust2 &&',
            'add_descriptions.py -i EC_metagenome_out/pred_metagenome_unstrat.tsv.gz -m EC',
            '                    -o EC_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz',
            '&&',
            'add_descriptions.py -i KO_metagenome_out/pred_metagenome_unstrat.tsv.gz -m KO',
            '                    -o KO_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz',
            '&&',
            'add_descriptions.py -i pathways_out/path_abun_unstrat.tsv.gz -m METACYC',
            '                    -o pathways_out/path_abun_unstrat_descrip.tsv.gz'
        ])

        get_cmd_func_l = lambda FUNC: [
            ('cd %s && ' % Var.out_dir + 'source activate picrust2 && '
             f'hsp.py -i {FUNC} -t out.tre -o {FUNC}_predicted.tsv.gz -p {p}'),
            ('cd %s && ' % Var.out_dir + 'source activate picrust2 && '
             'metagenome_pipeline.py '
             '-i ../%s ' % os.path.basename(seq_abundance_table_flpth) +
             '-m marker_predicted_and_nsti.tsv.gz '
             f'-f {FUNC}_predicted.tsv.gz '
             f'-o {FUNC}_metagenome_out')
        ] + ([] if FUNC == 'PHENO' else [  # no descriptions for IMG phenotype
            ('cd %s && ' % Var.out_dir + 'source activate picrust2 && '
             f'add_descriptions.py -i {FUNC}_metagenome_out/pred_metagenome_unstrat.tsv.gz -m {FUNC} '
             f'-o {FUNC}_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz'
             ),
        ])
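        # For illustration, with FUNC='COG' (and p=4) the lambda yields, in order:
        #   cd <out_dir> && source activate picrust2 && hsp.py -i COG -t out.tre -o COG_predicted.tsv.gz -p 4
        #   cd <out_dir> && source activate picrust2 && metagenome_pipeline.py -i ../study_seqs.tsv \
        #       -m marker_predicted_and_nsti.tsv.gz -f COG_predicted.tsv.gz -o COG_metagenome_out
        #   cd <out_dir> && source activate picrust2 && add_descriptions.py \
        #       -i COG_metagenome_out/pred_metagenome_unstrat.tsv.gz -m COG \
        #       -o COG_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz
        # For FUNC='PHENO' the add_descriptions.py step is skipped.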

        cmd_func_l = []
        for func in ['cog', 'pfam', 'tigrfam', 'pheno']:
            if params.getd(func) == 1:
                cmd_func_l.extend(get_cmd_func_l(func.upper()))

        #
        ##
        ### run
        ####
        #####

        run_check(cmd_pipeline)
        run_check(cmd_description)
        for cmd_func in cmd_func_l:
            run_check(cmd_func)

        #
        ##
        ### sanity checks
        ####
        #####

        if Var.debug:
            for func in Var.func_l:
                if not Var.params.getd(func):
                    continue

                fp0 = os.path.join(Var.out_dir,
                                   Var.func_2_cfg[func]['relfp'][0])
                fp1 = os.path.join(Var.out_dir,
                                   Var.func_2_cfg[func]['relfp'][1])

                # Check dropped amplicons are the unaligned/distant ones (debug)
                appfile.check_dropped_amplicon_ids(fp0, amp_mat)
                # Check no samples dropped (debug)
                appfile.check_dropped_sample_ids(fp1, amp_mat)

        #
        ##
        ### update/save Amplicon workflow objects
        ####
        #####

        path_abun_predictions_tsv_gz_flpth = os.path.join(
            Var.out_dir, 'pathways_out/path_abun_predictions.tsv.gz')

        attribute = 'MetaCyc Predictions'
        source = 'PICRUSt2'

        # if row AttributeMapping,
        # update that and referencing objs
        if amp_mat.row_attrmap_upa is not None:

            # update row AttributeMapping with traits
            id2attr = appfile.parse_picrust2_traits(
                path_abun_predictions_tsv_gz_flpth)
            ind, attribute = row_attrmap.add_attribute_slot(attribute, source)
            row_attrmap.map_update_attribute(ind, id2attr)
            row_attrmap_upa_new = row_attrmap.save()

            # update AmpliconMatrix which references row AttributeMapping
            amp_mat.obj['row_attributemapping_ref'] = row_attrmap_upa_new
            amp_mat_upa_new = amp_mat.save(name=params['output_name'])

            Var.objects_created.extend([
                {
                    'ref': row_attrmap_upa_new,
                    'description': 'Added attribute `%s`' % attribute,
                },
                {
                    'ref':
                    amp_mat_upa_new,
                    'description':
                    'Updated amplicon AttributeMapping reference to `%s`' %
                    row_attrmap_upa_new
                },
            ])

        #
        ##
        ### html report w/ heatmaps
        ####
        #####

        logging.info('Beginning report business')

        ##
        ## report

        Var.report_dir = os.path.join(Var.run_dir, 'report')

        report_html_flpth = report.HTMLReportWriter(
            [cmd_pipeline, cmd_description] + cmd_func_l, ).write()

        html_links = [{
            'path': Var.report_dir,
            'name': os.path.basename(report_html_flpth),
        }]

        #
        ##
        ### FunctionalProfile
        ####
        #####
        logging.info('Starting saving FunctionalProfiles if any')

        if Var.debug:
            FP_amp_mat_ref = params[
                'amplicon_matrix_upa']  # this makes mocking more flexible in case something makes a fake UPA
        else:
            FP_amp_mat_ref = amp_mat_upa_new  # this AmpliconMatrix is new one with new AttributeMapping

        # gunzip TSVs out to another directory
        tsv_dir = os.path.join(Var.run_dir, 'decompressed_tsv')
        os.mkdir(tsv_dir)

        for func in Var.func_l:
            if not Var.params.getd(func):
                continue

            func_name = Var.func_2_cfg[func]['name']

            if Var.params.getd('create_amplicon_fps'):
                id = 'amplicon_' + func
                desc = 'Amplicon %s abundance' % func_name

                fp_src = os.path.join(Var.out_dir,
                                      Var.func_2_cfg[func]['relfp'][0])
                fp_dst = os.path.join(tsv_dir, id + '.tsv')
                gunzip(fp_src, fp_dst)

                upa = Var.fpu.import_func_profile(
                    dict(
                        workspace_id=Var.params['workspace_id'],
                        func_profile_obj_name='%s.%s' %
                        (Var.params['output_name'], id),
                        original_matrix_ref=FP_amp_mat_ref,
                        profile_file_path=fp_dst,
                        profile_type='amplicon',
                        profile_category='organism',
                        data_epistemology='predicted',
                        epistemology_method='PICRUSt2',
                        description=desc,
                    ))['func_profile_ref']

                Var.objects_created.append(dict(ref=upa, description=desc))

            if Var.params.getd('create_sample_fps'):
                id = 'metagenome_' + func
                desc = 'Metagenome %s abundance' % func_name

                fp_src = os.path.join(Var.out_dir,
                                      Var.func_2_cfg[func]['relfp'][1])
                fp_dst = os.path.join(tsv_dir, id + '.tsv')
                gunzip(fp_src, fp_dst)

                upa = Var.fpu.import_func_profile(
                    dict(
                        workspace_id=Var.params['workspace_id'],
                        func_profile_obj_name='%s.%s' %
                        (Var.params['output_name'], id),
                        original_matrix_ref=FP_amp_mat_ref,
                        profile_file_path=fp_dst,
                        profile_type='mg',
                        profile_category='community',
                        data_epistemology='predicted',
                        epistemology_method='PICRUSt2',
                        description=desc,
                    ))['func_profile_ref']

                Var.objects_created.append(dict(ref=upa, description=desc))

        # look at TSVs
        dprint(
            'ls -lh %s/*' % tsv_dir,
            #'file -i %s/*/*' % tsv_dir,
            run='cli')

        #
        ##
        ### return files
        ####
        #####

        file_links = [{
            'path': Var.return_dir,
            'name': 'PICRUSt2_results.zip',
            'description': 'Input, output, cmd, intermediate files, log'
        }]

        params_report = {
            'warnings': Var.warnings,
            'objects_created': Var.objects_created,
            'file_links': file_links,
            'html_links': html_links,
            'direct_html_link_index': 0,
            'report_object_name': 'kb_PICRUSt2_report',
            'workspace_name': params['workspace_name'],
            'html_window_height': report.REPORT_HEIGHT,
        }

        Var.params_report = params_report

        obj = Var.kbr.create_extended_report(params_report)

        output = {
            'report_name': obj['name'],
            'report_ref': obj['ref'],
        }

        #END run_picrust2_pipeline

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_picrust2_pipeline return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
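
The `gunzip(fp_src, fp_dst)` helper called above is not part of this excerpt; a minimal stand-in, assuming it only needs to decompress a single .gz file to a destination path:

import gzip
import shutil

def gunzip(fp_src, fp_dst):
    """Decompress the gzipped file at fp_src to the plain file fp_dst."""
    with gzip.open(fp_src, 'rb') as f_in, open(fp_dst, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)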
Example #8
class VCFToVariation:
    def __init__(self, config, scratch, callback_url):
        self.scratch = scratch
        self.ws_url = config['workspace-url']
        self.callback_url = callback_url
        # All service clients share the job's SDK callback URL
        self.dfu = DataFileUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)
        self.au = AssemblyUtil(self.callback_url)
        self.gapi = GenericsAPI(self.callback_url)


    def _parse_vcf_data(self, params):
        vcf_filepath = self._stage_input(params)

        # file is validated by this point, can assume vcf_filepath is valid
        reader = vcf.Reader(open(vcf_filepath, 'r'))

        version = float(reader.metadata['fileformat'][4:])
        genotypes = reader.samples
        chromosomes = []
        contigs = {}
        totalvars = 0

        for record in reader:
            totalvars += 1
            if record.CHROM not in chromosomes:
                chromosomes.append(record.CHROM)

            if record.CHROM not in contigs.keys():
                passvar = 1 if not record.FILTER else 0

                contigs[record.CHROM] = {
                    'contig_id': record.CHROM,
                    'totalvariants': 1,
                    'passvariants': passvar,
                    'length': int(record.affected_end-record.affected_start),
                }
            else:
                contigs[record.CHROM]['totalvariants'] += 1
                if not record.FILTER:
                    contigs[record.CHROM]['passvariants'] += 1

        vcf_info = {
            'version': version,
            'contigs': contigs,
            'total_variants': totalvars,
            'genotype_ids': genotypes,
            'chromosome_ids': chromosomes,
            'file_ref': vcf_filepath
        }

        return vcf_info


    def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids):
        genos_not_found = []

        vgenotypes = [x.upper().strip() for x in vcf_genotypes]
        sids = [x.upper().strip() for x in sample_ids]

        for geno in vgenotypes:
            if geno not in sids:
                genos_not_found.append(geno)

        if not genos_not_found:
            return True
        else:
            return genos_not_found

    def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes, assembly_chromosomes):
        chromos_not_in_assembly = []

        pp(assembly_chromosomes)

        for chromo in vcf_chromosomes:
            if chromo not in assembly_chromosomes:
                chromos_not_in_assembly.append(chromo)

        if not chromos_not_in_assembly:
            return True
        else:
            return chromos_not_in_assembly

    def _get_vcf_version(self, vcf_filepath):
        with(gzip.open if is_gz_file(vcf_filepath) else open)(vcf_filepath, 'rt') as vcf:
            line = vcf.readline()
            tokens = line.split('=')

            if not (tokens[0].startswith('##fileformat')):
                log("Invalid VCF.  ##fileformat line in meta is improperly formatted.")
                raise ValueError("Invalid VCF.  ##fileformat line in meta is improperly formatted. "
                                 "Check VCF file specifications: https://samtools.github.io/hts-specs/")

            vcf_version = float(tokens[1][-4:].rstrip())

            return vcf_version
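
        # Worked example: for a first line of "##fileformat=VCFv4.2\n", tokens is
        # ['##fileformat', 'VCFv4.2\n'], tokens[1][-4:].rstrip() is '4.2', and the
        # method returns 4.2.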

    def validate_vcf(self, params):
        if 'genome_or_assembly_ref' not in params:
            raise ValueError('Genome or Assembly reference not in input parameters: \n\n' + str(params))
        if 'vcf_staging_file_path' not in params:
            raise ValueError('VCF staging file path not in input parameters: \n\n' + str(params))


        vcf_filepath = self._stage_input(params)

        vcf_version = self._get_vcf_version(vcf_filepath)

        # set up a directory for validation output
        validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        # vcftools (vcf-validator) supports VCF v4.0-4.2
        # https://github.com/vcftools/vcftools

        # EBIvariation/vcf-validator (vcf_validator_linux) supports VCF v4.1-4.3
        # https://github.com/EBIvariation/vcf-validator

        # vcftools is only used to validate VCF v4.0

        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-l")
            validator_cmd.append('error')
            print("VCF version "+str(vcf_version)+".")
        elif vcf_version >= 4.0:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version 4.0.")
        else:
            raise ValueError('VCF version not found in file, the ##fileformat line is malformed, or the version is < 4.0. '
                             'The ##fileformat line must be the first line of the VCF file and use the proper syntax. '
                             'Check VCF file specifications: https://samtools.github.io/hts-specs/')

        print("Validator command: {}".format(validator_cmd))

        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            if line.decode("utf-8").strip().startswith('[info]'):
                validator_output.append(line.decode("utf-8"))

        out, err = p.communicate()

        validation_output_filename = os.path.join(validation_output_dir, 'vcf_validation.txt')
        file_output_chk = []

        try:
            if validator_output[0][:6] == '[info]':
                # validation by vcf_validator_linux
                validation_output_filename = validator_output[1].split(' ')[6].strip('\n')
                vo = validator_output[2].split(' ')
                file_output_chk = ''.join(vo[9:]).strip('\n')

                if not os.path.exists(validation_output_filename):
                    raise ValueError(validation_output_filename+' does not exist!')

                if not file_output_chk == 'isvalid':
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))

                #TODO: more detailed validation parsing for vcf_validator_linux
            else:
                if validator_output:
                    with open(validation_output_filename, 'w') as f:
                        for line in validator_output:
                            f.write(str(line))
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))
                else:
                    with open(validation_output_filename, 'w') as f:
                        f.write("vcftools used to validate vcf file:\n" + vcf_filepath +
                                "\nFile is valid per VCF spec v4.0")

                # TODO: more detailed validation parsing for vcftools
        except IndexError:
            # If the VCF is < v4.1 and valid, vcftools produces no '[info]' lines
            # and the parsing above raises IndexError.
            if validator_output:
                with open(validation_output_filename, 'w') as f:
                    for line in validator_output:
                        f.write(str(line))
                print('\n'.join(validator_output))
                raise ValueError('\n'.join(validator_output))
            else:
                with open(validation_output_filename, 'w') as f:
                    f.write("vcftools used to validate vcf file:\n" + vcf_filepath +
                            "\nFile is valid per VCF spec v4.0")

        if not os.path.exists(validation_output_filename):
            print('Validator did not generate log file!')
            raise SystemError("Validator did not generate a log file.")

        log("Validator output filepath: {}".format(validation_output_filename))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filename

    def _stage_input(self, params):
        # extract file location from input ui parameters
        if params['vcf_staging_file_path'].startswith('/kb/module/test/'):
            # variation utils unit test
            vcf_local_file_path = params['vcf_staging_file_path']

            if vcf_local_file_path.endswith('.gz'):
                with gzip.open(vcf_local_file_path, 'rb') as f_in:
                    with open(vcf_local_file_path[:-3], 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)

                vcf_local_file_path = vcf_local_file_path[:-3]
        else:
            staging_dir = '/staging'
            vcf_local_file_path = os.path.join(staging_dir, params['vcf_staging_file_path'])

        if not os.path.exists(vcf_local_file_path):
            raise OSError('VCF input path does not exist, or is not readable')

        orig_file_path = os.path.join(self.scratch, 'original_' + os.path.basename(vcf_local_file_path))
        print(f'VCF: {vcf_local_file_path} Orig: {orig_file_path}')
        self.original_file = shutil.copy(vcf_local_file_path, orig_file_path)

        # TODO: use data file utils here, upload vcf to shock, use dfu.
        if is_gz_file(vcf_local_file_path):
            # /staging is read only, therefore have to copy before uncompressing
            if not vcf_local_file_path == os.path.join(self.scratch, params['vcf_staging_file_path']):
                copy = shutil.copy(vcf_local_file_path, os.path.join(self.scratch,params['vcf_staging_file_path']))
                unpack = self.dfu.unpack_file({'file_path': copy})
            else:
                unpack = {}
                unpack['file_path'] = os.path.join(self.scratch,params['vcf_staging_file_path'])
            params['vcf_local_file_path'] = unpack['file_path']
            return unpack['file_path']
        else:
            params['vcf_local_file_path'] = vcf_local_file_path 
            return vcf_local_file_path

    def _create_sample_attribute_file(self, vcf_file, sample_attribute_mapping_file):
        """
        function for creating sample attribute mapping file.
        """
        try:
            with open(vcf_file, 'r') as vcf_handle:
                lines = vcf_handle.readlines()

                for line in lines:
                    if line.startswith("#CHROM"):
                        header = line.lstrip().split("\t")

                        try:
                            with open(sample_attribute_mapping_file, 'w') as attribute_mapping_handle:
                                attribute_mapping_handle.write(
                                    "Attribute\tAttribute ontology ID\tUnit\tUnit ontology ID")
                                for i in range(9, len(header)):
                                    attribute_mapping_handle.write("\t" + header[i])
                                # attribute_mapping_handle.write("\n")

                                attribute_mapping_handle.write("label\t\t\t")
                                for j in range(9, len(header)):
                                    attribute_mapping_handle.write("\t" + header[j])
                                # attribute_mapping_handle.write("\n")
                        except IOError:
                            print("Could not write to file:", sample_attribute_mapping_file)

        except IOError:
            print("Could not read file:", vcf_file)

    def _validate_assembly_ids(self, params):
        # All chromosome ids from the vcf should be in assembly
        # but not all assembly chromosome ids should be in vcf


        if ('genome_ref' in params):
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': params['genome_or_assembly_ref']
            }])

            self.vcf_info['assembly_ref'] = subset[0]['data']['assembly_ref']

        if ('assembly_ref' in params):
            self.vcf_info['assembly_ref'] = params['assembly_ref']

        assembly_chromosome_ids_call = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])

        assembly_chromosomes = assembly_chromosome_ids_call[0]['data']['contigs'].keys()
        vcf_chromosomes = self.vcf_info['chromosome_ids']

        chk_assembly_ids =  self._chk_if_vcf_ids_in_assembly(vcf_chromosomes, assembly_chromosomes)

        if isinstance(chk_assembly_ids, list):
            failed_ids = ' '.join(chk_assembly_ids)
            print(f'VCF contig ids: {failed_ids} are not present in assembly.')
            raise ValueError(f'VCF contig ids: {failed_ids} are not present in assembly.')


        return assembly_chromosomes

    def _validate_sample_ids(self, params):
        # All samples within the VCF file need to be in sample attribute list


        vcf_genotypes = self.vcf_info['genotype_ids']

        sample_ids_subset = self.wsc.get_object_subset([{
            'included': ['/instances'],
            'ref': params['sample_attribute_ref']
        }])

        sample_ids = sample_ids_subset[0]['data']['instances'].keys()

        validate_genotypes = self._validate_vcf_to_sample(vcf_genotypes, sample_ids)

        if isinstance(validate_genotypes, list):
            failed_genos = ' '.join(validate_genotypes)
            print(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')
            raise ValueError(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')

        return sample_ids

    def _construct_contig_info(self, params):
        """
            KBaseGwasData.Variations type spec

            /*
               Contig variation data
                 contig_id - contig identifier
                 totalvariants - total number of variants in each contig
                 passvariants - total number of variants that pass quality variation filter in contig
                 length - length of contig from assembly data
             */

             typedef structure {
               string contig_id;
               int totalvariants;
               int passvariants;
               int length; // from assembly
             } contig_info;
        """

        assembly_chromosome_dict = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])[0]['data']['contigs']


        contigs = []

        contig_infos = self.vcf_info['contigs']


        for contig_id in contig_infos:
            length_contig = assembly_chromosome_dict[contig_id].get("length")
            contig_infos[contig_id]["length"] = length_contig
            contigs.append(contig_infos[contig_id])

        return contigs
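
        # Illustrative element of the returned `contigs` list (values are made up):
        #   {'contig_id': 'Chr01', 'totalvariants': 1523, 'passvariants': 1480, 'length': 43270923}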
   

    def _bgzip_vcf(self, vcf_filepath):

        if not os.path.exists(vcf_filepath):
           print (vcf_filepath + " does not exist")

        zip_cmd = ["bgzip", vcf_filepath]
        
        p = subprocess.Popen(zip_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()        
        
        bgzip_file_path = vcf_filepath + ".gz"
        print (bgzip_file_path)
          
        return bgzip_file_path
  
 
    def _index_vcf(self, bgzip_file):
 
        output_dir = self.scratch

        bgzip_filepath = os.path.join(self.scratch, bgzip_file)
        if not os.path.exists(bgzip_filepath):
           print (bgzip_filepath + " does not exist")

        index_cmd = ["tabix", "-p", "vcf", bgzip_filepath]       
        p = subprocess.Popen(index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()
         
        index_file_path = bgzip_filepath + ".tbi"
     
        return index_file_path

    def _index_assembly(self, assembly_file):
        if not os.path.exists(assembly_file):
           print (assembly_file + " does not exist")

        logging.info("indexing assembly file")

        assembly_index_cmd = ["samtools", "faidx", assembly_file]
        print(assembly_index_cmd)
        p = subprocess.Popen(assembly_index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()

        logging.info("indexing of assembly file done!")

        return assembly_file + ".fai"

    def _download_assembly(self, assembly_ref):
        file = self.au.get_assembly_as_fasta({
          'ref': assembly_ref
        })
        return file
 
    def _construct_variation(self, params, contigs_info):
        
        """
            KBaseGwasData.Variations type spec
             /*
               Variation object data structure
                 num_genotypes - number of total genotypes within variant file
                 num_variants - number of total variants within variant file
                 contigs - list of contig ids and variant information
                 attribute_ref - KBase reference to attribute mapping workspace object
                 genome_ref - KBase reference to genome workspace object
                 assembly_ref - KBase reference to assembly workspace object
                 vcf_handle_ref - VCF handle reference to VCF file

                 @optional genome_ref
             */
             typedef structure {
               int numgenotypes;
               int numvariants;
               list<contig_info> contigs;
               attribute_ref population; // KBaseExperiments.AttributeMapping
               genome_ref genome_ref; // KBaseGenomes.Genome
               assembly_ref assemby_ref; // KBaseGenomeAnnotations.Assembly
               vcf_handle_ref vcf_handle_ref;
             } Variations;

            :param params: KBase ui input parameters
            :param population: previously constructed sample population data
            :return: constructed variation object (dictionary)
        """

        if not self.vcf_info['file_ref'].startswith(self.scratch):
            new_vcf_file = os.path.join(self.scratch, os.path.basename(self.vcf_info['file_ref']))
            self.vcf_info['file_ref'] = shutil.copy(self.vcf_info['file_ref'], new_vcf_file)
      

        vcf_staged_file = self.original_file

        bgzip_file_path = self._bgzip_vcf(vcf_staged_file)
        vcf_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': bgzip_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(bgzip_file_path, vcf_shock_file_ref)


        index_file_path = self._index_vcf(bgzip_file_path)
        vcf_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(index_file_path, vcf_index_shock_file_ref)


        assembly_file_path = self._download_assembly(self.vcf_info['assembly_ref'])['path']

        assembly_index_file_path = self._index_assembly(assembly_file_path)
        assembly_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': assembly_index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(assembly_index_file_path, assembly_index_shock_file_ref)
        
        variation_obj = {
            'numgenotypes': int(len(self.vcf_info['genotype_ids'])),
            'numvariants': int(self.vcf_info['total_variants']),
            'contigs': contigs_info,
            'population': params['sample_attribute_ref'],

            # TYPE SPEC CHANGE: need to change type spec to assembly_ref instead of assemby_ref
            'assemby_ref': self.vcf_info['assembly_ref'],
            'vcf_handle_ref': vcf_shock_file_ref['handle']['hid'],
            'vcf_handle' : vcf_shock_file_ref['handle'],
            'vcf_index_handle_ref': vcf_index_shock_file_ref['handle']['hid'],
            'vcf_index_handle': vcf_index_shock_file_ref['handle'],
            'assembly_index_handle_ref': assembly_index_shock_file_ref['handle']['hid'],
            'assembly_index_handle': assembly_index_shock_file_ref['handle']
        }
        if 'genome_ref' in params:
            variation_obj['genome_ref'] =  params['genome_ref']

        return variation_obj

    def _save_var_obj(self, params, var):
        """
        :param params:
        :param var:
        :return:
            DataFileUtils object_info:
                objid - the numerical id of the object.
                name - the name of the object.
                type - the type of the object.
                save_date - the save date of the object.
                ver - the version of the object.
                saved_by - the user that saved or copied the object.
                wsid - the id of the workspace containing the object.
                workspace - the name of the workspace containing the object.
                chsum - the md5 checksum of the object.
                size - the size of the object in bytes.
                meta - arbitrary user-supplied metadata about the object.
        """

        print('Saving Variation to workspace...\n')

        if var:
            if 'variation_object_name' not in params:
                var_obj_name = 'variation_'+str(uuid.uuid4())
            else:
                var_obj_name = params['variation_object_name']

            var_obj_info = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['workspace_name']),
                'objects': [{
                    'type': 'KBaseGwasData.Variations',
                    'data': var,
                    'name': var_obj_name
                }]
            })[0]

            return var_obj_info
        else:
            raise ValueError('Variation object blank, cannot save to workspace!')

    def _validate_sample_attribute_ref(self, params):

        # params["sample_attribute_ref"] = ''  # just for testing
        if not params['sample_attribute_ref']:
            sample_attribute_mapping_file = os.path.join(self.scratch, "sample_attribute.tsv")  # hardcoded for testing
            self._create_sample_attribute_file(params['vcf_local_file_path'], sample_attribute_mapping_file)

            logging.info("Uploading sample attribute file to ref")
            vcf_sample_attribute_shock_file_ref = self.dfu.file_to_shock(
                {'file_path': sample_attribute_mapping_file, 'make_handle': 1}
            )
            shock_id = vcf_sample_attribute_shock_file_ref['shock_id']
            ws_id = self.dfu.ws_name_to_id(params['workspace_name'])
            import_params = {
                'input_shock_id': shock_id,
                'output_ws_id': ws_id,
                'output_obj_name': 'Sample_attribute'}

            ret = self.gapi.file_to_attribute_mapping(import_params)
            params['sample_attribute_ref'] = ret['attribute_mapping_ref']

    def import_vcf(self, params):
        # VCF validation
        # VCF file validation
        file_valid_result = self.validate_vcf(params)
        self._validate_sample_attribute_ref(params)
        # VCF file parsing
        self.vcf_info = self._parse_vcf_data(params)
        # Validate vcf chromosome ids against assembly chromosome ids
        self._validate_assembly_ids(params)
        # Validate vcf genotypes against sample meta data ids
        self._validate_sample_ids(params)

        # Variation object construction
        # construct contigs_info
        contigs_info = self._construct_contig_info(params)
        # construct variation
        var = self._construct_variation(params, contigs_info)

        # Save variation object to workspace
        var_wksp_obj = self._save_var_obj(params, var)

        return [var_wksp_obj, var]
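
The `is_gz_file` helper used in `_get_vcf_version` and `_stage_input` is not shown in this excerpt; a minimal stand-in based on the gzip magic bytes:

def is_gz_file(filepath):
    # gzip streams start with the two magic bytes 0x1f 0x8b.
    with open(filepath, 'rb') as fh:
        return fh.read(2) == b'\x1f\x8b'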
Example #9
    def run_FAPROTAX(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_FAPROTAX

        logging.info(params)

        Var.update({ # carry over into globals `Var`, regardless of resetting, for all API-method runs
            'params': Params(params),
            'shared_folder': self.shared_folder,
            'kbase_endpoint': self.config['kbase-endpoint'], # contains environment, for constructing Genome landing page url
            #---
            'ws': Workspace(self.workspace_url),
            'dfu': DataFileUtil(self.callback_url), # instantiate here so within runtime of @patch
            'kbr': KBaseReport(self.callback_url), # instantiate here so within runtime of @patch 
            'gapi': GenericsAPI(self.callback_url),
            'fpu': FunctionalProfileUtil(self.callback_url, service_ver='beta'), # TODO overhead?
            #---
            'warnings': [],
            #---
            'run_dir': os.path.join(self.shared_folder, 'kbfptx_' + str(uuid.uuid4())),
        })

        os.mkdir(Var.run_dir)
        Var.update({
            'return_dir': os.path.join(Var.run_dir, 'return'),
        })
        os.mkdir(Var.return_dir)

        #
        ##
        ### detect input type
        ####
        #####

        oi = Var.ws.get_object_info3(
            {'objects': [{
                'ref': params['input_upa']
            }]})['infos'][0]
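        # `oi` is a Workspace object_info tuple; oi[2] is the object type string,
        # e.g. 'KBaseMatrices.AmpliconMatrix-1.2' (version suffix illustrative),
        # which drives the dispatch below.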

        if oi[2].startswith('KBaseSearch.GenomeSet'):
            output = do_GenomeSet_workflow()

        elif oi[2].startswith('KBaseMatrices.AmpliconMatrix'):
            output = do_AmpliconMatrix_workflow()

        else:
            raise Exception('Unknown type `%s` for `input_upa`' % oi[2])

        #END run_FAPROTAX

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_FAPROTAX return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #10
    def run_classify(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_classify

        logging.info(params)

        params = Params(params)
        Var.params = params
        '''
        tmp/                                        `shared_folder`
        └── kb_rdp_clsf_<uuid>/                      `run_dir`
            ├── return/                             `return_dir`
            |   ├── cmd.txt
            |   ├── study_seqs.fna
            |   └── RDP_Classifier_output/          `out_dir`
            |       ├── out_allRank.tsv
            |       └── out_fixedRank.tsv
            └── report/                             `report_dir`
                ├── pie_hist.html
                ├── suburst.html
                └── report.html
        '''

        ##
        ## set up globals ds `Var` for this API-method run
        ## which involves making this API-method run's directory structure

        Var.update({
            'run_dir':
            os.path.join(self.shared_folder,
                         'kb_rdp_clsf_' + str(uuid.uuid4())),
            'dfu':
            DataFileUtil(self.callback_url),
            'ws':
            Workspace(self.workspace_url),
            'gapi':
            GenericsAPI(self.callback_url),
            'kbr':
            KBaseReport(self.callback_url),
            'warnings': [],
        })

        os.mkdir(Var.run_dir)

        Var.update({
            'return_dir': os.path.join(Var.run_dir, 'return'),
            'report_dir': os.path.join(Var.run_dir, 'report'),
        })

        os.mkdir(Var.return_dir)
        os.mkdir(Var.report_dir)

        Var.update(
            {'out_dir': os.path.join(Var.return_dir, 'RDP_Classifier_output')})

        os.mkdir(Var.out_dir)

        # cat and gunzip SILVA refdata
        # which has been split into ~99MB chunks to get onto Github
        #if params.is_custom():
        #    app_file.prep_refdata()

        #
        ##
        ### load objects
        ####
        #####

        amp_mat = AmpliconMatrix(params['amp_mat_upa'])
        row_attr_map_upa = amp_mat.obj.get('row_attributemapping_ref')

        create_row_attr_map = row_attr_map_upa is None
        row_attr_map = AttributeMapping(row_attr_map_upa, amp_mat=amp_mat)

        #
        ##
        ### cmd
        ####
        #####

        fasta_flpth = os.path.join(Var.return_dir, 'study_seqs.fna')
        Var.out_allRank_flpth = os.path.join(Var.out_dir, 'out_allRank.tsv')
        Var.out_shortSeq_flpth = os.path.join(
            Var.out_dir,
            'out_unclassifiedShortSeqs.txt')  # seqs too short to classify

        shutil.copyfile(amp_mat.get_fasta(), fasta_flpth)

        cmd = ('java -Xmx4g -jar %s classify %s ' %
               (Var.classifier_jar_flpth, fasta_flpth) +
               ' '.join(params.cli_args) + ' ' + '--format allRank ' +
               '--outputFile %s --shortseq_outfile %s' %
               (Var.out_allRank_flpth, Var.out_shortSeq_flpth))

        run_check(cmd)

        #
        ##
        ### extract classifications
        ####
        #####

        id2taxStr = app_file.get_fix_filtered_id2tax()

        # get ids of classified and unclassified seqs
        shortSeq_id_l = app_file.parse_shortSeq()  # sequences too short to get classified
        classified_id_l = list(id2taxStr.keys())

        # make sure classifieds and shorts complement
        if Var.debug:
            ret = sorted(classified_id_l + shortSeq_id_l)
            mat = sorted(amp_mat.obj['data']['row_ids'])
            assert ret == mat, \
                'diff1: %s, diff2: %s' % (set(ret)-set(mat), set(mat)-set(ret))

        if len(classified_id_l) == 0:
            raise Exception('No sequences were long enough to be classified')

        # add in id->'' for unclassified seqs
        # so id2taxStr_l is complete
        # so no KeyErrors later
        for shortSeq_id in shortSeq_id_l:
            id2taxStr[shortSeq_id] = ''

        # add to globals for testing
        Var.shortSeq_id_l = shortSeq_id_l

        #
        ##
        ### add to row AttributeMapping
        ####
        #####

        prose_args = params.get_prose_args()

        attribute = ('RDP Classifier Taxonomy (conf=%s, gene=%s)' %
                     (prose_args['conf'], prose_args['gene']))
        attribute_names = row_attr_map.get_attribute_names()
        if attribute in attribute_names:
            attribute = get_numbered_duplicate(attribute_names, attribute)

        source = 'RDP Classifier'

        ind, attribute = row_attr_map.add_attribute_slot(attribute, source)
        row_attr_map.update_attribute(ind, id2taxStr)

        #
        ##
        ### save obj
        ####
        #####

        amp_mat_output_name = Var.params['output_name']
        attr_map_output_name = (amp_mat_output_name + '.Amplicon_attributes'
                                if create_row_attr_map else None)

        row_attr_map_upa_new = row_attr_map.save(name=attr_map_output_name)

        amp_mat.obj['row_attributemapping_ref'] = row_attr_map_upa_new
        amp_mat_upa_new = amp_mat.save(amp_mat_output_name)

        objects_created = [
            dict(  # row AttrMap
                ref=row_attr_map_upa_new,
                description='%sAdded attribute `%s`' % (
                    'Created. ' if create_row_attr_map else '',
                    attribute,
                )),
            dict(  # AmpMat
                ref=amp_mat_upa_new,
                description=
                'Updated amplicon AttributeMapping reference to `%s`' %
                row_attr_map_upa_new),
        ]

        # testing
        if Var.debug:
            Var.update(dict(
                amp_mat=amp_mat,
                row_attr_map=row_attr_map,
            ))

        #
        ##
        ### html report
        ####
        #####

        hrw = report.HTMLReportWriter(cmd_l=[cmd])

        html_flpth = hrw.write()

        html_links = [{
            'path': Var.report_dir,
            'name': os.path.basename(html_flpth),
        }]

        #
        ##
        ###
        ####
        #####

        file_links = [{
            'path': Var.run_dir,
            'name': 'RDP_Classifier_results.zip',
            'description': 'Input, output'
        }]

        params_report = {
            'warnings': Var.warnings,
            'objects_created': objects_created,
            'html_links': html_links,
            'direct_html_link_index': 0,
            'file_links': file_links,
            'workspace_id': params['workspace_id'],
            'html_window_height': Var.report_height,
        }

        # testing
        Var.params_report = params_report

        report_obj = Var.kbr.create_extended_report(params_report)

        output = {
            'report_name': report_obj['name'],
            'report_ref': report_obj['ref'],
        }

        #END run_classify

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_classify return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #11
class Subsetting_Matrices:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.dfu = DataFileUtil(self.callback_url)

        # set up directory for files folder
        self.output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(self.output_dir)
        self.files_folder = os.path.join(self.output_dir, 'files')
        os.mkdir(self.files_folder)

        self.file_paths = []
        self.html_paths = []

        self.GenAPI = GenericsAPI(self.callback_url)

    def _get_df(self, params):
        """
        Get AmpliconMatrix data and build a pandas.DataFrame,
        then fetch taxonomy data and add it as a 'taxonomy' column.
        """

        logging.info('Getting DataObject')

        # Amplicon data
        obj = self.dfu.get_objects(
            {'object_refs': [params.get('input_obj_ref')]})
        self._make_fasta(obj_ref=obj['data'][0]['data']['amplicon_set_ref'])
        amp_data = obj['data'][0]['data']

        row_ids = amp_data['data']['row_ids']
        col_ids = amp_data['data']['col_ids']
        values = amp_data['data']['values']
        # Add 'taxonomy' column
        col_ids.append('taxonomy')
        # Make pandas DataFrame
        df = pd.DataFrame(index=row_ids, columns=col_ids)
        for i in range(len(row_ids)):
            df.iloc[i, :-1] = values[i]

        # Get object
        test_row_attributes_permanent_id = obj['data'][0]['data'][
            'row_attributemapping_ref']
        obj = self.dfu.get_objects(
            {'object_refs': [test_row_attributes_permanent_id]})
        tax_dict = obj['data'][0]['data']['instances']

        # Add taxonomy data
        for row_indx in df.index:
            df.loc[row_indx, 'taxonomy'] = tax_dict[row_indx][0]
        return df

    def _get_mdf(self, params):
        """
        Get the metadata object and build a pd.DataFrame from it, with samples as the index and the specified subsetting column.
        """

        logging.info('Getting MetadataObject')

        subsetting_field = params.get('subset_field')
        subsetting_field = subsetting_field['meta_group'][0]
        params['subset_field'] = subsetting_field
        # Get object
        obj = self.dfu.get_objects(
            {'object_refs': [params.get('attribute_mapping_obj_ref')]})
        meta_dict = obj['data'][0]['data']['instances']
        attr_l = obj['data'][0]['data']['attributes']

        # Find index of specified category name
        indx = 0
        for i in range(len(attr_l)):
            if attr_l[i]['attribute'] == subsetting_field:
                indx = i
                break
        # Set metadata_samples
        metadata_samples = meta_dict.keys()
        # Make pandas DataFrame
        mdf = pd.DataFrame(index=metadata_samples, columns=[subsetting_field])
        i = 0
        for key, val in meta_dict.items():
            mdf.iloc[i] = val[indx]
            i += 1
        return mdf

    def insert_newlines(self, string, every):
        return '\n'.join(string[i:i + every]
                         for i in range(0, len(string), every))
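
        # Worked example: insert_newlines('ACGTACGTACGT', 5) returns 'ACGTA\nCGTAC\nGT'
        # (used below to wrap consensus sequences at 60 characters per FASTA line).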

    def _make_fasta(self, obj_ref):

        logging.info(
            'Making fasta file from AmpliconSet obj: {}'.format(obj_ref))

        set_obj = self.dfu.get_objects({'object_refs': [obj_ref]})
        OTUs = set_obj['data'][0]['data']['amplicons'].keys()
        with open(os.path.join(self.files_folder, "amp_set.fa"),
                  'w') as fa_file:

            logging.info('Writing to amp_set.fa file')

            for key in OTUs:
                con_str = '>' + key + '\n'
                con_str += self.insert_newlines(
                    set_obj['data'][0]['data']['amplicons'][key]
                    ['consensus_sequence'], 60)
                con_str += '\n'
                fa_file.write(con_str)

    def _make_group_dict(self, mdf, subset_field):
        """
        Make a dictionary keyed by subsetting-column value, whose values are the lists of samples having that value.
        """

        logging.info('Making grouping dictionary')

        group_dict = {}
        for sample, group in zip(mdf.index, mdf[subset_field]):
            try:
                group_dict[group].append(sample)
            except KeyError:
                group_dict.update({group: [sample]})

        for group, sample_list in group_dict.items():
            group_dict[group].append('taxonomy')

        return group_dict
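
        # Illustrative result (sample and group names are made up):
        #   {'control': ['S1', 'S4', 'taxonomy'], 'treated': ['S2', 'S3', 'taxonomy']}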

    def _create_subset_matrices(self, df, mdf, subset_field):
        """
        create dictionary of subset pd.DataFrames
        """

        logging.info('Creating matrices...')

        group_dict = self._make_group_dict(mdf=mdf, subset_field=subset_field)

        # Create dict of sub matrices
        dict_of_sub_matrices = {}
        for key, val in group_dict.items():
            data = df[val]
            dict_of_sub_matrices.update({key: data})

        # Drop rows that have all zero counts
        for key, matrix in dict_of_sub_matrices.items():
            to_drop = []
            for indx in matrix.index:
                if all(val == 0 for val in matrix.loc[indx][0:-1]):
                    to_drop.append(indx)
            dict_of_sub_matrices[key] = matrix.drop(to_drop)

        return dict_of_sub_matrices
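    # Each returned DataFrame keeps only that group's sample columns plus the trailing
    # 'taxonomy' column, with rows whose counts are all zero dropped.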

    def _save_matrices(self, matrices):
        """
        Take a dictionary of pd.DataFrames and save each as a tab-separated CSV named after its key
        """

        logging.info('Saving matrices: {}'.format(matrices.keys()))

        for group, matrix in matrices.items():
            name = group + '.csv'
            matrix.to_csv(os.path.join(self.files_folder, name), sep='\t')
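    # e.g. a group named 'soil' (hypothetical) is written to the files folder as soil.csv;
    # note the file is tab-separated despite the .csv extension.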

    def _create_html_report(self):
        """
        Create html report of files in zip by walking through output folder
        """

        logging.info('Creating html report..')

        html_str = '<html>'
        html_str += '<h3>Files In Output Zip File:</h3>\n'
        for root, folders, files in os.walk(self.output_dir):
            # Find the fasta and csv files by their extensions.
            for f in files:
                if re.match(r'^[a-zA-Z]+.*\.(fa|csv)$', f):
                    html_str += '<p>' + f + '</p>\n'
        html_str += '</html>'

        with open(os.path.join(self.files_folder, "index.html"),
                  'w') as index_file:
            index_file.write(html_str)

        # have needed files saved to folder before shock
        shock = self.dfu.file_to_shock({
            'file_path': self.files_folder,
            'make_handle': 0,
            'pack': 'zip'
        })
        # list that goes to 'html_links'
        self.html_paths.append({
            'shock_id': shock['shock_id'],
            'name': 'index.html',
            'label': 'Report',
            'description': "files in zip"
        })
        # list that goes to 'file_paths'
        self.file_paths.append(os.path.join(self.files_folder, 'files.zip'))

    def _call_and_create_objects(self, params):

        logging.info('_call_and_create_objects method')

        list_of_matrix_files = []
        groups = []
        for root, folders, files in os.walk(self.files_folder):

            logging.info('Finding files..')

            # Find the fasta and csv files by their extensions.
            for f in files:
                if re.match(r'^[a-zA-Z]+.*\.fa$', f):
                    fa_file = os.path.join(root, f)
                if re.match(r'^[a-zA-Z]+.*\.csv$', f):
                    groups.append(f[0:-4])
                    list_of_matrix_files.append(os.path.join(root, f))

        for csv_file_path, group_name in zip(list_of_matrix_files, groups):

            logging.info('Sending data to importer:\n'
                         'csv_file_path: {}\n'
                         'group_name: {}\n'
                         'fa_file: {}'.format(csv_file_path, group_name,
                                              fa_file))

            params['obj_type'] = 'AmpliconMatrix'
            params['matrix_name'] = group_name
            params['tsv_fasta'] = {
                'tsv_file_tsv_fasta':
                csv_file_path,
                'fasta_file_tsv_fasta':
                fa_file,
                'metadata_keys_tsv_fasta':
                'taxonomy_id, taxonomy, taxonomy_source, consensus_sequence'
            }
            params['scale'] = 'raw'
            params['description'] = 'dsc'
            params['amplicon_set_name'] = group_name + '-set'
            params['sample_set_ref'] = params.get('attribute_mapping_obj_ref')
            params['input_local_file'] = True

            logging.info('Sending params: {}'.format(
                json.dumps(params, indent=1)))

            obj_run = self.GenAPI.import_matrix_from_biom(params=params)
            logging.info('Object run: {}'.format(obj_run))

    def _create_amp(self):

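        # Sketch of an AmpliconMatrix object as returned by DataFileUtil.get_objects;
        # note that in this snippet amp_structure is built but never returned or used.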
        amp_structure = {
            'data': [{
                'data': {
                    'amplicon_set_ref': '',
                    'col_attributemapping_ref': '',
                    'col_mapping': {},
                    'data': {},
                    'row_attributemapping_ref': '',
                    'row_mapping': {},
                    'scale': 'raw'
                },
                'info': [],
                'path': [''],
                'provenance': [],
                'creator': '',
                'orig_wsid': 0000,
                'created': '',
                'epoch': 0000,
                'refs': [],
                'copy_source_inaccessible': 0,
                'extracted_ids': {}
            }]
        }

    def run(self, params):

        logging.info('--->\nrunning Amp_Subset_Util with input \n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        df = self._get_df(params)
        mdf = self._get_mdf(params)
        matrices = self._create_subset_matrices(
            df=df, mdf=mdf, subset_field=params.get('subset_field'))
        self._save_matrices(matrices)
        self._create_html_report()
        self._call_and_create_objects(params)

        return {'file_paths': self.file_paths, 'html_paths': self.html_paths}
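    # Hedged usage sketch (the class/constructor name and object refs below are
    # assumptions, not from the source; the log above refers to the app as Amp_Subset_Util):
    #     util = AmpSubsetUtil(config)
    #     result = util.run({
    #         'attribute_mapping_obj_ref': '11111/2/3',
    #         'subset_field': {'meta_group': ['Env_Package']},
    #         # ... plus whatever refs _get_df() expects for the AmpliconMatrix itself
    #     })
    #     result  ->  {'file_paths': [...], 'html_paths': [...]}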
class ImportAttributeMappingUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.genapi = GenericsAPI(self.callback_url)

    def import_attribute_mapping_from_staging(self, params):
        """
          import_attribute_mapping_from_staging: wrapper method for
                                    GenericsAPI.file_to_fbamodel_attribute_mapping

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          attribute_mapping_name: output AttributeMapping object name
          workspace_id: workspace ID where the object will be saved

          return:
          obj_ref: return object reference
        """

        log('--->\nrunning ImportAttributeMappingUtil.import_attribute_mapping_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_attribute_mapping_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        ws_id = params['workspace_id']

        import_attribute_mapping_params = {
            'output_obj_name': params['attribute_mapping_name'],
            'output_ws_id': ws_id,
            'input_file_path': scratch_file_path
        }

        ref = self.genapi.file_to_fbamodel_attribute_mapping(
            import_attribute_mapping_params)

        returnVal = {'obj_ref': ref.get('attribute_mapping_ref')}

        return returnVal
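    # Hedged example (all values are placeholders): params such as the following would
    # satisfy validate_import_attribute_mapping_from_staging_params below:
    #     {
    #         'staging_file_subdir_path': 'subdir_1/attribute_mapping.tsv',
    #         'workspace_id': 12345,
    #         'attribute_mapping_name': 'my_attribute_mapping',
    #     }
    # and the method returns {'obj_ref': '<workspace object reference>'}.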

    @staticmethod
    def validate_import_attribute_mapping_from_staging_params(params):
        """
        validate_import_attribute_mapping_from_staging_params:
                    validates params passed to import_attribute_mapping_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'workspace_id',
                'attribute_mapping_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                        import_attribute_mapping_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_id: workspace ID that the imported object and report are stored to
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}

        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "FBAModelSet Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported File: {}\n'.format(
            params.get('staging_file_subdir_path'))
        report_params = {
            'message':
            upload_message,
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Imported FBAModel Attribute Mapping'
            }],
            'workspace_id':
            params['workspace_id'],
            'report_object_name':
            'import_model_attri_mapping_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
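    # Hedged end-to-end sketch (config and params are placeholders): the two public
    # methods are typically chained by the caller as
    #     util = ImportAttributeMappingUtil(config)
    #     ret = util.import_attribute_mapping_from_staging(params)
    #     report = util.generate_report(ret['obj_ref'], params)
    #     report  ->  {'report_name': ..., 'report_ref': ...}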