Example #1
class genelistutil:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)

    def download_genelist(self, genelistref, genesetfile):
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        with open(genesetfile, 'w') as filehandle:
            for item in geneset['element_ordering']:
                filehandle.write('%s\n' % item)

    def listToString(self, s):
        # concatenate the list of genome refs into a single string
        return "".join(s)

    def get_genomeid_from_featuresetid(self, genelistref):
        genome = {}
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']

        for k, v in geneset['elements'].items():
            genome[self.listToString(v)] = 1
        if len(genome) != 1:
            raise ValueError("source of genome is not unique")
        else:
            return list(genome.keys())[0]
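
A minimal driver for this utility might look like the following sketch (the FeatureSet reference and output path are hypothetical; assumes SDK_CALLBACK_URL is set and the KBase clients are importable):

util = genelistutil()
genelistref = '12345/6/7'  # hypothetical FeatureSet reference (workspace/object/version)
util.download_genelist(genelistref, '/kb/module/work/tmp/geneset.txt')
genome_ref = util.get_genomeid_from_featuresetid(genelistref)
print('FeatureSet genes come from genome ' + genome_ref)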
Example #2
    @classmethod
    def setUpClass(cls):
        token = os.environ.get('KB_AUTH_TOKEN', None)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'provenance': [
                            {'service': 'GenomeFileUtil',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('GenomeFileUtil'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = GenomeFileUtil(cls.cfg)
        gff_path = "data/fasta_gff/RefSeq/Bacterial_Data/NC_021490.gff.gz"
        fasta_path = "data/fasta_gff/RefSeq/Bacterial_Data/NC_021490.fasta.gz"
        ws_obj_name = 'fungal_model'
        suffix = int(time.time() * 1000)
        cls.wsName = "test_GenomeFileUtil_" + str(suffix)
        ret = cls.wsClient.create_workspace({'workspace': cls.wsName})

        print('Uploading GFF file')
        result = cls.serviceImpl.fasta_gff_to_genome(
            cls.ctx,
            {
                'workspace_name': cls.wsName,
                'genome_name': 'MyGenome',
                'fasta_file': {'path': fasta_path},
                'gff_file': {'path': gff_path},
                'source': 'GFF',
                'type': 'Reference'
            })[0]
        data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        cls.genome_orig = data_file_cli.get_objects(
            {'object_refs': [result['genome_ref']]})['data'][0]['data']

        print('testing GFF download by building the file')
        down_result = cls.serviceImpl.genome_to_gff(
            cls.ctx, {'genome_ref': result['genome_ref']})[0]

        print('Reuploading GFF file')
        new_result = cls.serviceImpl.fasta_gff_to_genome(
            cls.ctx,
            {
                'workspace_name': cls.wsName,
                'genome_name': 'MyGenome',
                'fasta_file': {'path': fasta_path},
                'gff_file': {'path': down_result['file_path']},
                'source': 'GFF',
                'type': 'Reference'
            })[0]
        cls.genome_new = data_file_cli.get_objects({'object_refs': [new_result['genome_ref']]})['data'][0]['data']
Example #3
    @classmethod
    def setUpClass(cls):
        token = os.environ.get('KB_AUTH_TOKEN', None)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'provenance': [
                            {'service': 'GenomeFileUtil',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('GenomeFileUtil'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = GenomeFileUtil(cls.cfg)
        # gbk_path = "data/Cyanidioschyzon/Cyanidioschyzon_merolae.ASM9120v1.30.gbff"
        gbk_path = "data/Cyanidioschyzon/Cyanidioschyzon_merolae_one_locus.gbff"
        ws_obj_name = 'Cyanidioschyzon_merolae_duplicate_test_orig'
        suffix = int(time.time() * 1000)
        cls.wsName = "test_GenomeFileUtil_" + str(suffix)
        cls.wsClient.create_workspace({'workspace': cls.wsName})
        result = cls.serviceImpl.genbank_to_genome(cls.ctx, {
            'file': {'path': gbk_path},
            'workspace_name': cls.wsName,
            'genome_name': ws_obj_name,
            'generate_ids_if_needed': 1,
            'taxon_id': '511145',
            'source': "Ensembl user"
        })[0]

        data_file_cli = DataFileUtil(
            os.environ['SDK_CALLBACK_URL'],
            token=cls.ctx['token'],
            service_ver='dev'
        )
        cls.genome_orig = data_file_cli.get_objects({'object_refs': [result['genome_ref']]})['data'][0]['data']
        print('testing Genbank download by building the file')
        cls.serviceImpl.export_genome_as_genbank(cls.ctx, {
            'input_ref': result['genome_ref']})
        new_gbk_path = "/kb/module/work/tmp/Cyanidioschyzon_merolae_duplicate_test_orig/KBase_derived_Cyanidioschyzon_merolae_duplicate_test_orig.gbff"
        new_ws_obj_name = 'Cyanidioschyzon_merolae_duplicate_test_new'
        new_result = cls.serviceImpl.genbank_to_genome(cls.ctx, {
            'file': {'path': new_gbk_path},
            'workspace_name': cls.wsName,
            'genome_name': new_ws_obj_name,
            'generate_ids_if_needed': 1,
            'taxon_id': '511145',
            'source': "Ensembl user"
        })[0]
        cls.genome_new = data_file_cli.get_objects({'object_refs': [new_result['genome_ref']]})['data'][0]['data']
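
A test method built on this setUp would typically assert that the re-imported genome matches the original; a minimal sketch (the method name is hypothetical, assuming a unittest.TestCase subclass):

    def test_roundtrip_feature_counts(self):
        # sketch: the re-uploaded genome should preserve feature counts
        orig, new = self.genome_orig, self.genome_new
        self.assertEqual(len(orig.get('features', [])), len(new.get('features', [])))
        self.assertEqual(len(orig.get('cdss', [])), len(new.get('cdss', [])))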
Example #4
File: FileUtil.py Project: n1mus/CoverM
def load_fastas(config, scratch: str, upa: str):
    '''
    Returns list of (fasta_path, upa)
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    mgu = MetagenomeUtils(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        # file_output = os.path.join(scratch, "input_fasta.fa")
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]
    elif "KBaseSets.AssemblySet" in obj_type:
        fasta_paths = []
        for item_upa in obj_data['data']['items']:
            faf = au.get_assembly_as_fasta({"ref": item_upa['ref']})
            fasta_paths.append((faf['path'], item_upa['ref']))
        return fasta_paths
    elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
        fasta_paths = []
        bin_file_dir = mgu.binned_contigs_to_file({
            'input_ref': upa,
            'save_to_shock': 0
        })['bin_file_directory']
        for (dirpath, dirnames, filenames) in os.walk(bin_file_dir):
            for fasta_file in filenames:
                fasta_path = os.path.join(scratch, fasta_file)
                fasta_path = os.path.splitext(fasta_path)[0] + ".fa"
                copyfile(os.path.join(bin_file_dir, fasta_file), fasta_path)
                # Should I verify that the bins have contigs?
                # is it possible to have empty bins?
                fasta_paths.append((fasta_path, upa))
            break
        return fasta_paths
    else:
        raise ValueError('Input genome/metagenome reference has unhandled type: ' + obj_type)

    fasta_paths = []
    for genome_upa in upas:
        genome_data = ws.get_objects2({'objects': [{
            "ref": genome_upa
        }]})['data'][0]['data']
        assembly_upa = genome_upa + ';' + str(
            genome_data.get('contigset_ref')
            or genome_data.get('assembly_ref'))
        faf = au.get_assembly_as_fasta({'ref': assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))

    return fasta_paths
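
A hypothetical call, assuming a config dict carrying the callback and workspace URLs (the UPA and scratch path are made up):

config = {'callback_url': os.environ['SDK_CALLBACK_URL'],
          'workspace-url': 'https://appdev.kbase.us/services/ws'}
for fasta_path, source_upa in load_fastas(config, '/kb/module/work/tmp', '12345/6/7'):
    print(source_upa + ' -> ' + fasta_path)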
Example #5
def mock_dfu_get_objects(params):
    logging.info('Mocking `dfu.get_objects(%s)`' % params)

    upa = ref_leaf(params['object_refs'][0])
    fp = _glob_upa(GET_OBJECTS_DIR, upa)

    # Download and cache
    if fp is None:
        logging.info('Calling in cache mode `dfu.get_objects`')

        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        obj = dfu.get_objects(params)
        fp = os.path.join(
            mkcache(GET_OBJECTS_DIR),
            file_safe_ref(upa) + TRANSFORM_NAME_SEP +
            obj['data'][0]['info'][1] + '.json')
        with open(fp, 'w') as fh:
            json.dump(obj, fh)
        return obj

    # Pull from cache
    else:
        with open(fp) as fh:
            obj = json.load(fh)
        return obj
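
One way to wire this mock into a test, sketched with unittest.mock (the injection point impl.dfu is an assumption about the object under test):

from unittest.mock import Mock

mock_dfu = Mock()
mock_dfu.get_objects.side_effect = mock_dfu_get_objects
# impl.dfu = mock_dfu  # inject into the object under test
obj = mock_dfu.get_objects({'object_refs': ['12345/6/7']})  # hypothetical ref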
Example #6
class geneminerutils:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        #self.hr = htmlreportutils()
        #self.config = config
        #self.params = params

    def download_genelist(self, genelistref):
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        return geneset['element_ordering']

    def generate_query(self, genomenetmine_dyn_url, genelistref, species, pheno):
        #pheno = ["disease"]
        #species = "potatoknet"
        #genes = ["PGSC0003DMG400006345", "PGSC0003DMG400012792", "PGSC0003DMG400033029", "PGSC0003DMG400016390",
        #         "PGSC0003DMG400039594", "PGSC0003DMG400028153"]
        #genomenetmine_dyn_url = 'http://ec2-18-236-212-118.us-west-2.compute.amazonaws.com:5000/networkquery/api'
        genes = self.download_genelist(genelistref)
        gsp = genescoreparser()
        x = gsp.summary(genomenetmine_dyn_url, genes, species, pheno)
        return (x)

    def get_evidence(self, genomenetmine_dyn_url, genelistref, species, pheno):
        genes = self.download_genelist(genelistref)
        ep = evidenceparser()
        x = ep.summary(genomenetmine_dyn_url, genes, species, pheno)
        return (x)
Example #7
 def test_gff_and_metagenome_to_metagenome(self):
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
     result = self.serviceImpl.ws_obj_gff_to_metagenome(
         self.ctx, {
             'workspace_name': self.wsName,
             'genome_name': 'MyGenome',
             'gff_file': {
                 'path': self.gff_path
             },
             'ws_ref': self.metagenome_ref,
             'source': 'GFF',
             'type': 'Reference',
             'genome_type': 'Metagenome',
             'generate_missing_genes': True,
         })[0]
     self.assertTrue('metagenome_ref' in result)
     ret = dfu.get_objects({'object_refs':
                            [result['metagenome_ref']]})['data'][0]
     metagenome = ret['data']
     info = ret['info']
     # type_checks
     self.assertTrue(
         'KBaseMetagenomes.AnnotatedMetagenomeAssembly' in info[2])
      # make sure it's the same as the original
     self._compare_features(self.genome_orig, metagenome)
Example #8
    def check_CDS_warnings(self, result, test_name):
        data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                     service_ver='dev')
        genome = data_file_cli.get_objects(
            {'object_refs': [result['genome_ref']]})['data'][0]['data']
        print("IN TEST NAME : " + str(test_name))
        cds_warning_count = 0
        cds_with_warning_count = 0
        if 'cdss' in genome:
            total_cds_count = len(genome['cdss'])
            for feature in genome['cdss']:
                if 'warnings' in feature:
                    if test_name == "test_jgi_bacterial_fasta_gff2_to_genome":
                        print(
                            str(feature['id']) + " warnings:" +
                            str(feature['warnings']))
                        print("Location: " + str(feature['location']))
                        print("Translation: " + feature['protein_translation'])
                        print("DNA Sequence: " + feature["dna_sequence"])
                    cds_with_warning_count = cds_with_warning_count + 1
                    cds_warning_count = cds_warning_count + len(
                        feature['warnings'])

            print("Total CDS: " + str(total_cds_count))
            print("CDS Warning Count: " + str(cds_warning_count))
            print("CDSs with a warning Count: " + str(cds_with_warning_count))
            print("Percent CDS with warning: " +
                  str((cds_with_warning_count / float(total_cds_count)) * 100))
Example #9
 def print_genome_warnings(self, result):
     data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                  service_ver='dev')
     genome = data_file_cli.get_objects(
         {'object_refs': [result['genome_ref']]})['data'][0]['data']
     if 'warnings' in genome:
         print("Genome warnings:" + str(genome['warnings']))
Example #10
def load_fastas(config, scratch, upa):
    '''
    Returns list of (fasta_path, upa)
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        file_output = os.path.join(scratch, "input_fasta.fa")
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]

    fasta_paths = []
    for genome_upa in upas:
        if upa != genome_upa:
            genome_upa = upa + ';' + genome_upa
        genome_data = ws.get_objects2({'objects': [{"ref": genome_upa}]})['data'][0]['data']
        target_upa = genome_data.get('contigset_ref') or genome_data.get('assembly_ref')
        assembly_upa = genome_upa + ';' + target_upa
        faf = au.get_assembly_as_fasta({"ref": assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))

    return fasta_paths
Example #11
 def test_no_sequence_kept(self):
     # features, cds, and non_coding_features should not have sequences in it.
     print("test_no_sequence_kept")
     gbk_path = "data/e_coli/GCF_000005845.2_ASM584v2_genomic.gbff"
     ws_obj_name = 'no_sequence'
     result = self.getImpl().genbank_to_genome(
         self.getContext(),
         {
           'file': {
               'path': gbk_path},
           'workspace_name': self.getWsName(),
           'genome_name': ws_obj_name,
           'generate_ids_if_needed': 1
         })[0]
     data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'], 
                                 token=self.__class__.token,
                                 service_ver='dev')
     genome = data_file_cli.get_objects({'object_refs': [result['genome_ref']]})['data'][0]['data']
     count_features_with_dna_sequence = 0
     for feature in genome['features']:
         if "dna_sequence" in feature:
             count_features_with_dna_sequence += 1
     count_non_coding_features_with_sequence = 0
     for feature in genome['non_coding_features']:
         if "dna_sequence" in feature:
             count_non_coding_features_with_sequence += 1
     count_cdss_with_sequence = 0
     for feature in genome['cdss']:
         if "dna_sequence" in feature:
             count_cdss_with_sequence += 1        
      self.assertTrue(count_features_with_dna_sequence == 0,
                      "No features should have DNA sequences.")
      self.assertTrue(count_non_coding_features_with_sequence == 0,
                      "No non_coding_features should have DNA sequences.")
      self.assertTrue(count_cdss_with_sequence == 0,
                      "No CDSs should have DNA sequences.")
Example #12
    def BuildFastaFromSequenceSet(self, ctx, params):
        """
        :param params: instance of type "BuildSeqIn" -> structure: parameter
           "workspace_name" of String, parameter "SequenceSetRef" of String,
           parameter "fasta_outpath" of String
        :returns: instance of type "BuildSeqOut" -> structure: parameter
           "fasta_outpath" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN BuildFastaFromSequenceSet
        dfu = DataFileUtil(self.callback_url)

        bu = BackgroundUtils()
        TU = TestUtils()
        if params['TESTFLAG'] and params['background']:
            targetpath = '/kb/module/work/tmp/testgenome.fa'
            TU.GetGenome(targetpath)
            bu.BuildBackground(targetpath)
        elif params['background']:

            ws = Workspace('https://appdev.kbase.us/services/ws')
            subset = ws.get_object_subset([{
                'included': ['/features/[*]/location', '/features/[*]/id', '/assembly_ref'],
                'ref': params['genome_ref']}])
            aref = subset[0]['data']['assembly_ref']
            assembly_ref = {'ref': aref}
            print('Downloading Assembly data as a Fasta file.')
            assemblyUtil = AssemblyUtil(self.callback_url)
            fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)['path']
            bu.BuildBackground(fasta_file)


        get_objects_params = {'object_refs': [params['SequenceSetRef']]}
        SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']

        with open(params['fasta_outpath'], 'w') as outFile:
            for s in SeqSet['sequences']:
                outFile.write('>' + s['sequence_id'] + '\n')
                outFile.write(s['sequence'] + '\n')

        fu = FastaUtils()
        if params['mask_repeats']:
            fu.RemoveRepeats(params['fasta_outpath'], params['fasta_outpath'])

        output = {'fasta_outpath': params['fasta_outpath']}
        #END BuildFastaFromSequenceSet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
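
A hypothetical invocation of this method, with all parameter values made up for illustration:

output = impl.BuildFastaFromSequenceSet(ctx, {
    'workspace_name': 'my_workspace',
    'SequenceSetRef': '12345/6/7',  # hypothetical SequenceSet reference
    'fasta_outpath': '/kb/module/work/tmp/seqs.fa',
    'TESTFLAG': 0,
    'background': 0,
    'mask_repeats': 1,
})[0]
print(output['fasta_outpath'])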
Example #13
    def mock_dfu_get_objects(params):
        logging.info('Mocking dfu.get_objects(%s)' % params)

        upa = ref_leaf(params['object_refs'][0])
        fp = _glob_upa(GET_OBJECTS_DIR, upa)

        # Download and cache
        if fp is None:
            logging.info('Calling in cache mode `dfu.get_objects`')

            dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
            obj = dfu.get_objects(params)
            fp = os.path.join(
                mkcache(GET_OBJECTS_DIR),
                file_safe_ref(upa) + '__' + obj['data'][0]['info'][1] +
                '.json')
            with open(fp, 'w') as fh:
                json.dump(obj, fh)

        # Pull from cache
        else:
            with open(fp) as fh:
                obj = json.load(fh)

        # swap in arg obj
        if replace_obj is not None:
            ref = params['object_refs'][0]
            if ref in replace_obj:
                obj['data'][0]['data'] = replace_obj[ref]

        return obj
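
The reference to replace_obj implies this version is nested inside a factory that closes over it; a minimal sketch of such a wrapper (the factory name is an assumption):

from unittest.mock import Mock

def get_mock_dfu(replace_obj=None):
    def mock_dfu_get_objects(params):
        ...  # body as above, closing over replace_obj
    mock_dfu = Mock()
    mock_dfu.get_objects.side_effect = mock_dfu_get_objects
    return mock_dfu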
Example #14
    def BuildFastaFromSequenceSet(self, ctx, params):
        """
        :param params: instance of type "BuildSeqIn" -> structure: parameter
           "workspace_name" of String, parameter "SequenceSetRef" of String,
           parameter "fasta_outpath" of String
        :returns: instance of type "BuildSeqOut" -> structure: parameter
           "fasta_outpath" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN BuildFastaFromSequenceSet
        dfu = DataFileUtil(self.callback_url)
        get_objects_params = {'object_refs': [params['SequenceSetRef']]}
        SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']

        with open(params['fasta_outpath'], 'w') as outFile:
            for s in SeqSet['sequences']:
                outFile.write('>' + s['sequence_id'] + '\n')
                outFile.write(s['sequence'] + '\n')
        output = {'fasta_outpath': params['fasta_outpath']}
        #END BuildFastaFromSequenceSet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #15
 def _get_assembly(self, genome):
     if 'assembly_ref' in genome:
         assembly_ref = genome['assembly_ref']
     else:
         assembly_ref = genome['contigset_ref']
     log('Assembly reference = ' + assembly_ref)
     log('Downloading assembly')
     dfu = DataFileUtil(self.cfg.callbackURL)
     log('object_refs:' + self.genome_ref + ";" + assembly_ref)
     assembly_data = dfu.get_objects(
         {'object_refs':
          [self.genome_ref + ";" + assembly_ref]})['data'][0]['data']
     if isinstance(assembly_data['contigs'], dict):  # is an assembly
         circular_contigs = set([
             x['contig_id'] for x in list(assembly_data['contigs'].values())
             if x.get('is_circ')
         ])
     else:  # is a contig set
         circular_contigs = set([
             x['id'] for x in assembly_data['contigs']
             if x.get('replicon_geometry') == 'circular'
         ])
     au = AssemblyUtil(self.cfg.callbackURL)
     assembly_file_path = au.get_assembly_as_fasta(
         {'ref': self.genome_ref + ";" + assembly_ref})['path']
     return assembly_file_path, circular_contigs
Example #16
class VariationToVCF:
    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    @staticmethod
    def is_gz_file(filepath):
        with open(filepath, 'rb') as test_f:
            return binascii.hexlify(test_f.read(2)) == b'1f8b'

    def export_as_vcf(self, params):
        if 'input_var_ref' not in params:
            raise ValueError('Cannot export Variation- no input_var_ref field defined.')

        file = self.variation_to_vcf({'variation_ref': params['input_var_ref']})

        export_dir = os.path.join(self.scratch, file['variation_name'])
        os.makedirs(export_dir)

        try:
            shutil.move(file['path'], os.path.join(export_dir, os.path.basename(file['path'])))
        except shutil.Error as e:
            exit(e)

        dfupkg = self.dfu.package_for_download({
             'file_path': export_dir,
             'ws_refs': [params['input_var_ref']]
        })

        return {'shock_id': dfupkg['shock_id']}

    def variation_to_vcf(self, params):
        self.validate_params(params)

        print('downloading ws object data: '+params["variation_ref"])

        variation_obj = self.dfu.get_objects({'object_refs': [params['variation_ref']]})['data'][0]
        ws_type = variation_obj['info'][2]
        obj_name = variation_obj['info'][1]

        if 'KBaseGwasData.Variations' in ws_type:
            dl_path = self.process_vcf(self.scratch, variation_obj['data'])
        else:
            raise ValueError('Cannot write data to VCF; invalid WS type (' + ws_type +
                             '). Supported type is KBaseGwasData.Variations')

        return {'path': dl_path, 'variation_name': obj_name}

    def process_vcf(self, output_vcf_file_path, data):
        obj = self.dfu.shock_to_file({
            'handle_id': data['vcf_handle_ref'],
            'file_path': output_vcf_file_path,
        })

        return obj['file_path']

    def validate_params(self, params):
        for key in ['variation_ref']:
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')
Example #17
class SampleServiceUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.srv_wiz_url = config['srv-wiz-url']
        self.sample_url = config.get('kbase-endpoint') + '/sampleservice'
        self.dfu = DataFileUtil(self.callback_url)
        self.sample_ser = SampleService(self.sample_url)

        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def get_sample_service_url(self):
        return self.sample_url

    def get_sample(self, sample_id, version=None):

        sample_url = self.get_sample_service_url()
        headers = {"Authorization": self.token}
        params = {"id": sample_id, "version": version}
        payload = {
            "method": "SampleService.get_sample",
            "id": str(uuid.uuid4()),
            "params": [params],
            "version": "1.1"
        }
        resp = requests.post(url=sample_url,
                             headers=headers,
                             data=json.dumps(payload))
        resp_json = resp.json()
        if resp_json.get('error'):
            raise RuntimeError(
                f"Error from SampleService - {resp_json['error']}")
        sample = resp_json['result'][0]

        # sample = self.sample_ser.get_sample(params)[0]

        return sample

    def get_ids_from_samples(self, sample_set_ref):
        logging.info('start retrieving sample ids from sample set')

        sample_set = self.dfu.get_objects({"object_refs": [sample_set_ref]
                                           })['data'][0]['data']

        samples = sample_set['samples']

        data_ids = []
        for sample in samples:
            sample_id = sample.get('id')
            version = sample.get('version')

            sample_data = self.get_sample(sample_id, version=version)

            data_id = sample_data['name']
            data_ids.append(data_id)

        return data_ids
Example #18
 def DownloadMotifSet(self, refList, callback):
     MotifSetDict = {}
     dfu = DataFileUtil(callback)
     for ref in refList:
         get_objects_params = {'object_refs': [ref]}
         MotifSet = dfu.get_objects(get_objects_params)['data'][0]['data']
         MotifSetDict[ref] = deepcopy(MotifSet)
     return MotifSetDict
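
The deepcopy guards against callers mutating a shared payload dict. Called from within the owning class, a sketch with hypothetical references:

motif_sets = self.DownloadMotifSet(['12345/6/7', '12345/8/1'],
                                   os.environ['SDK_CALLBACK_URL'])
for ref, motif_set in motif_sets.items():
    print(ref, sorted(motif_set.keys()))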
Example #19
class genelistutil:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)

    def download_genelist(self, genelistref, genesetfile):
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        with open(genesetfile, 'w') as filehandle:
            for item in geneset['element_ordering']:
                filehandle.write('%s\n' % item)

    def listToString(self, s):
        # concatenate the list of genome refs into a single string
        return "".join(s)

    # function to get unique values
    def unique(self, list1):
        # deduplicate by round-tripping through a set
        unique_list = list(set(list1))
        for x in unique_list:
            print(x)

    def get_genomeid_from_featuresetid(self, genelistref):
        genome = {}
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        for k, v in geneset['elements'].items():
            genome[self.listToString(v)] = 1
        if len(genome) != 1:
            raise ValueError("source of genome is not unique")
        else:
            # the keys hold the genome refs; the values are dummy flags
            return list(genome.keys())[0]
Example #20
def DownloadMotifSet(refList, callback):
    MotifSetDict = {}
    # init DFU
    dfu = DataFileUtil(callback)
    for ref in refList:
        get_objects_params = {'object_refs': [ref]}
        MotifSet = dfu.get_objects(get_objects_params)['data'][0]['data']
        MotifSetDict[ref] = deepcopy(MotifSet)
    return MotifSetDict
Example #21
    def _fetch_df_from_refs(self, object_refs):
        dfu = DataFileUtil(self.callback_url)
        tables = dfu.get_objects({'object_refs': object_refs})['data']
        lambda_df = self._fetch_df_from_json(tables[0])
        stoich_df = self._fetch_df_from_json(tables[1])

        df = stoich_df.merge(lambda_df["lambda_O2"],
                             left_index=True,
                             right_index=True)
        return df
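
For reference, the index-aligned merge at the end behaves like this small self-contained pandas example (toy data):

import pandas as pd

stoich_df = pd.DataFrame({'C': [1.0, 2.0]}, index=['cpdA', 'cpdB'])
lambda_df = pd.DataFrame({'lambda_O2': [0.5, 0.7]}, index=['cpdA', 'cpdB'])
# merging on the shared index appends the lambda_O2 column to the stoichiometry table
df = stoich_df.merge(lambda_df['lambda_O2'], left_index=True, right_index=True)
print(df)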
Example #22
 @classmethod
 def setUpClass(cls):
     token = os.environ.get('KB_AUTH_TOKEN', None)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
      cls.ctx.update({
          'token': token,
          'provenance': [{
              'service': 'GenomeFileUtil',
              'method': 'please_never_use_it_in_production',
              'method_params': []
          }],
          'authenticated': 1
      })
     config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('GenomeFileUtil'):
         cls.cfg[nameval[0]] = nameval[1]
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL, token=token)
     cls.serviceImpl = GenomeFileUtil(cls.cfg)
     gbk_path = "data/Arabidopsis_gbff/A_thaliana_Ensembl_TAIR10_38_chr4_minus_xref.gbff"
     ws_obj_name = 'Yeast_chromosome1'
     suffix = int(time.time() * 1000)
     cls.wsName = "test_GenomeFileUtil_" + str(suffix)
     ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
     result = cls.serviceImpl.genbank_to_genome(
         cls.ctx, {
             'file': {
                 'path': gbk_path
             },
             'workspace_name': cls.wsName,
             'genome_name': ws_obj_name,
             'generate_ids_if_needed': 1,
             'source': "Ensembl"
         })[0]
     #        print("HERE IS THE RESULT:")
     data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                  token=cls.ctx['token'],
                                  service_ver='dev')
     cls.genome = data_file_cli.get_objects(
         {'object_refs': [result['genome_ref']]})['data'][0]['data']
      with open(cls.cfg['scratch'] + "/relationship_test_genome.json", 'w') as out:
          json.dump(cls.genome, out)
     cls.gene_ids = set((x['id'] for x in cls.genome['features']))
     cls.nc_feat_ids = set(
         (x['id'] for x in cls.genome['non_coding_features']))
     cls.mrna_ids = set((x['id'] for x in cls.genome['mrnas']))
     cls.cds_ids = set((x['id'] for x in cls.genome['cdss']))
Example #23
 @classmethod
 def setUpClass(cls):
     token = os.environ.get('KB_AUTH_TOKEN', None)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
      cls.ctx.update({
          'token': token,
          'provenance': [{
              'service': 'GenomeFileUtil',
              'method': 'please_never_use_it_in_production',
              'method_params': []
          }],
          'authenticated': 1
      })
     config_file = os.environ['KB_DEPLOYMENT_CONFIG']
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('GenomeFileUtil'):
         cls.cfg[nameval[0]] = nameval[1]
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL, token=token)
     cls.serviceImpl = GenomeFileUtil(cls.cfg)
     gff_path = "data/e_coli/NC_000913.3.gff3"
     fna_path = "data/e_coli/NC_000913.3.fasta"
     # fna_path = "data/e_coli/GCF_000005845.2_ASM584v2.fasta"
     ws_obj_name = 'ecoli_contigs'
     suffix = int(time.time() * 1000)
     cls.wsName = "test_GenomeFileUtil_" + str(suffix)
     cls.wsClient.create_workspace({'workspace': cls.wsName})
     result = cls.serviceImpl.fasta_gff_to_genome(
         cls.ctx, {
             'gff_file': {
                 'path': gff_path
             },
             'fasta_file': {
                 'path': fna_path
             },
             'taxon_id': 511145,
             'workspace_name': cls.wsName,
             'genome_name': ws_obj_name,
             'generate_missing_genes': 1,
             'generate_ids_if_needed': 1
         })[0]
     data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                  token=cls.ctx['token'],
                                  service_ver='dev')
     dfu_result = data_file_cli.get_objects(
         {'object_refs': [result['genome_ref']]})
     cls.genome = dfu_result['data'][0]['data']
Example #24
 @classmethod
 def setUpClass(cls):
     token = os.environ.get('KB_AUTH_TOKEN', None)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
      cls.ctx.update({
          'token': token,
          'provenance': [{
              'service': 'GenomeFileUtil',
              'method': 'please_never_use_it_in_production',
              'method_params': []
          }],
          'authenticated': 1
      })
     config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('GenomeFileUtil'):
         cls.cfg[nameval[0]] = nameval[1]
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL, token=token)
     cls.serviceImpl = GenomeFileUtil(cls.cfg)
     gbk_path = "data/e_coli/GCF_000005845.2_ASM584v2_genomic.gbff"
     ws_obj_name = 'ecoli_genome'
     suffix = int(time.time() * 1000)
     cls.wsName = "test_GenomeFileUtil_" + str(suffix)
     ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
     result = cls.serviceImpl.genbank_to_genome(
         cls.ctx, {
             'file': {
                 'path': gbk_path
             },
             'workspace_name': cls.wsName,
             'genome_name': ws_obj_name,
             'generate_ids_if_needed': 1,
             'source': "RefSeq Reference"
         })[0]
     #        print("HERE IS THE RESULT:")
     data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                  token=cls.ctx['token'],
                                  service_ver='dev')
     genome = data_file_cli.get_objects(
         {'object_refs': [result['genome_ref']]})['data'][0]['data']
     cls.assembly_ref = genome["assembly_ref"]
Example #25
 def test_gff_and_metagenome_to_metagenome(self):
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
     result = self.serviceImpl.ws_obj_gff_to_metagenome(self.ctx, {
         'workspace_name': self.wsName,
         'genome_name': 'MyGenome',
         'gff_file': {'path': self.gff_path},
         'ws_ref': self.metagenome_ref,
         'source': 'GFF',
         'type': 'Reference',
         'genome_type': 'Metagenome',
         'is_metagenome': True,
         'generate_missing_genes': True,
         'taxon_id': '3702',
     })[0]
     metagenome = dfu.get_objects({'object_refs': [result['metagenome_ref']]})['data'][0]['data']
      # make sure it's the same as the original
     self._compare_features(self.genome_orig, metagenome)
Example #26
    def check_CDS_warnings(self, result, test_name):
        data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        genome = data_file_cli.get_objects(
            {'object_refs': [result['genome_ref']]})['data'][0]['data']
        print("IN TEST NAME : " + str(test_name))
        cds_warning_count = 0
        cds_with_warning_count = 0
        if 'cdss' in genome:
            total_cds_count = len(genome['cdss'])
            for feature in genome['cdss']:
                if 'warnings' in feature:
                    cds_with_warning_count = cds_with_warning_count + 1
                    cds_warning_count = cds_warning_count + len(
                        feature['warnings'])

            print("Total CDS: " + str(total_cds_count))
            print("CDS Warning Count: " + str(cds_warning_count))
            print("CDSs with a warning Count: " + str(cds_with_warning_count))
            print("Percent CDS with warning: " +
                  str((cds_with_warning_count / float(total_cds_count)) * 100))
Example #27
def get_upa_name(ws_url, cb_url, upa, is_test):
    '''
    Returns the object name for the given workspace UPA.
    '''
    if is_test:
        return "test_object"

    ws = Workspace(ws_url)
    objs = ws.get_object_info3({'objects': [{'ref': upa}]})
    upa_names = [info[1] for info in objs['infos']]
    if len(upa_names) > 0:
        return upa_names[0]

    dfu = DataFileUtil(cb_url)
    objs = dfu.get_objects({'object_refs': [upa]})['data']
    upa_names = [obj['info'][1] for obj in objs]
    if len(upa_names) > 0:
        return upa_names[0]
    else:
        raise ValueError("Could not find name of workspace object with id %s" %
                         upa)
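
A hypothetical lookup, with made-up service URLs and UPA:

name = get_upa_name('https://appdev.kbase.us/services/ws',
                    os.environ['SDK_CALLBACK_URL'],
                    '12345/6/7',  # hypothetical UPA
                    is_test=False)
print(name)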
Example #28
 @classmethod
 def setUpClass(cls):
     token = os.environ.get('KB_AUTH_TOKEN', None)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({'token': token,
                     'provenance': [
                         {'service': 'GenomeFileUtil',
                          'method': 'please_never_use_it_in_production',
                          'method_params': []
                          }],
                     'authenticated': 1})
     config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('GenomeFileUtil'):
         cls.cfg[nameval[0]] = nameval[1]
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL, token=token)
     cls.serviceImpl = GenomeFileUtil(cls.cfg)
     gbk_path = "data/Arabidopsis_gbff/Arab_Chloro_Modified.gbff"
     ws_obj_name = 'ArabidopsisChloro'
     suffix = int(time.time() * 1000)
     cls.wsName = "test_GenomeFileUtil_" + str(suffix)
     ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
     result = cls.serviceImpl.genbank_to_genome(
         cls.ctx,
         {
           'file': {
               'path': gbk_path},
           'workspace_name': cls.wsName,
           'genome_name': ws_obj_name,
           'generate_ids_if_needed': 1,
           'source': "RefSeq Latest"
         })[0]
     data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                             token=cls.ctx['token'],
                             service_ver='dev')
     cls.genome = data_file_cli.get_objects({'object_refs': [result['genome_ref']]})['data'][0]['data']
      with open(cls.cfg['scratch'] + "/test_genome.json", 'w') as out:
          json.dump(cls.genome, out)
Example #29
 def test_genome_gff_to_genome(self):
     result = self.serviceImpl.ws_obj_gff_to_genome(
         self.ctx, {
             'workspace_name': self.wsName,
             'genome_name': 'MyGenome',
             'gff_file': {
                 'path': self.gff_path
             },
             'ws_ref': self.genome_ref,
             'source': 'GFF',
             'type': 'Reference',
             'taxon_id': '243276'
         })[0]
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
     genome = dfu.get_objects({"object_refs":
                               [result['genome_ref']]})['data'][0]['data']
     self.feature_list_comparison(self.genome_orig, genome, 'features')
     self.feature_list_comparison(self.genome_orig, genome, 'cdss')
     self.feature_list_comparison(self.genome_orig, genome, 'mrnas')
     self.feature_list_comparison(self.genome_orig, genome,
                                  'non_coding_features')
Example #30
 def test_same_genome(self):
     gbk_path = "data/e_coli/GCF_000005845.2_ASM584v2_genomic.gbff"
     ws_obj_name = 'ecoli_genome'
      existing_assembly_ref = self.__class__.assembly_ref
     result = self.getImpl().genbank_to_genome(
         self.getContext(), {
             'file': {
                 'path': gbk_path
             },
             'workspace_name': self.getWsName(),
             'genome_name': ws_obj_name,
             'generate_ids_if_needed': 1,
             'generate_missing_genes': 1,
             'source': 'refseq reference',
             'use_existing_assembly': existing_assembly_ref
         })[0]
     data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
     genome = data_file_cli.get_objects(
         {'object_refs': [result['genome_ref']]})['data'][0]['data']
     self.assertTrue(genome['assembly_ref'] == existing_assembly_ref,
                     "Same file did not keep the same assembly ref")
Example #31
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']

        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        with open(files['file_path'], 'w') as outfile:
            writer = csv.DictWriter(outfile, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(sorted(feat['aliases'].keys())),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
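
A hypothetical end-to-end use of this downloader (the config keys mirror those read in __init__; the UPA is made up):

config = {'scratch': '/kb/module/work/tmp',
          'workspace-url': 'https://appdev.kbase.us/services/ws'}
downloader = FeatureSetDownload(config)
params = {'featureset_ref': '12345/6/7'}
fs_name, files = downloader.to_tsv(params)
print(downloader.export([files['file_path']], fs_name, params))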
Example #32
class FeatureSetBuilder:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method
        """

        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in ['diff_expression_ref', 'workspace_name',
                  'p_cutoff', 'q_cutoff', 'fold_change_cutoff']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used')

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                       down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{'ref': up_feature_set_ref,
                                 'description': 'Upper FeatureSet Object'}]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{'ref': down_feature_set_ref,
                                 'description': 'Lower FeatureSet Object'}]

        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{'ref': filtered_expression_matrix_ref,
                                 'description': 'Filtered ExpressionMatrix Object'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        upper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref':
                                                     up_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            upper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref':
                                                     down_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<tr><td>Upper_FeatureSet</td></tr>',
                                                          upper_feature_content)

                report_template = report_template.replace('<tr><td>Lower_FeatureSet</td></tr>',
                                                          lower_feature_content)

                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report'})
        return html_report

    def _process_diff_expression(self, diff_expression_set_ref, result_directory,
                                 condition_label_pair):
        """
        _process_diff_expression: process differential expression object info
        """

        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2({'objects':
                                                  [{'ref':
                                                   diff_expression_set_ref}]})['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name)

        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

        for set_item in set_items:
            diff_expression_ref = set_item['ref']

            diff_expression_data = self.ws.get_objects2({'objects':
                                                        [{'ref':
                                                         diff_expression_ref}]})['data'][0]['data']

            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref

                with open(diff_expr_matrix_file, 'a') as csvfile:
                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({'gene_id': row_id,
                                         'log2_fold_change': row_value[0],
                                         'p_value': row_value[1],
                                         'q_value': row_value[2]})

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;
        """

        log('start saving KBaseCollections.FeatureSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression',
                            'element_ordering': feature_ids,
                            'elements': elements}

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': feature_set_data,
                         'name': feature_set_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return feature_set_obj_ref
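
    # For concreteness, a toy payload matching the KBaseCollections.FeatureSet
    # typedef above (hypothetical ids; sketch only):
    EXAMPLE_FEATURE_SET = {
        'description': 'example FeatureSet',
        'element_ordering': ['geneA', 'geneB'],
        'elements': {'geneA': ['1/2/3'], 'geneB': ['1/2/3']},  # feature id -> genome refs
    }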

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value,
                             comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """

        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = {'NA', 'null', ''}
                col_value = {row_p_value, row_q_value, row_fold_change_cutoff}

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition and q_value_condition and
                                            (float(row_fold_change_cutoff) >=
                                             comp_fold_change_cutoff))

                    down_matches_condition = (p_value_condition and q_value_condition and
                                              (float(row_fold_change_cutoff) <=
                                               -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))
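
    # In short: a feature is reported "up" when p <= comp_p_value,
    # q <= comp_q_value and log2_fold_change >= |comp_fold_change_cutoff|,
    # and "down" when log2_fold_change <= -|comp_fold_change_cutoff| under
    # the same p/q conditions; rows with 'NA'/'null'/'' values are skipped.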

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name, filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generate a filtered ExpressionMatrix object
        """

        log('start saving ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects(
            {'object_refs': [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
                filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix',
                                                         filtered_expression_matrix_suffix,
                                                         expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                    filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values
        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {'type': expression_matrix_info[2],
                          'data': filtered_expression_matrix_data,
                          'name': filtered_expression_matrix_name}
        # record the DEM used for filtering in an EM field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data']['diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [diff_expression_matrix_ref]

        save_object_params = {
            'id': workspace_id,
            'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return filtered_expression_matrix_ref
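
    # Usage sketch (all refs hypothetical): filter an ExpressionMatrix down to
    # two features and record the DEM it was derived from:
    #   ref = self._filter_expression_matrix('1/2/3', {'gene_a', 'gene_b'},
    #                                        'my_workspace', '_filtered',
    #                                        diff_expression_matrix_ref='1/9/1')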

    def _xor(self, a, b):
        return bool(a) != bool(b)
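
    # Truth table: _xor('a', '') -> True, _xor('a', 'b') -> False,
    # _xor(None, None) -> False. Used in upload_featureset_from_diff_expr to
    # require exactly one of run_all_combinations / condition_pairs.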

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:

            label_string = condition_pair['label_string'][0].strip()
            label_list = [x.strip() for x in label_string.split(',')]
            first_label = label_list[0]
            second_label = label_list[1]

            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(first_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(second_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked
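
    # A valid condition_pairs entry looks like (labels hypothetical):
    #   {'label_string': ['wild_type, mutant']}
    # i.e. a one-element list holding a comma-separated pair of labels.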

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2(
            {'objects': [{'ref': diff_expression_set_ref}]})['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels
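
    # For a set whose items are labelled 'a, b' and 'a, c' this returns
    # ([['a', 'b'], ['a', 'c']], {'a', 'b', 'c'}) -- labels hypothetical.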

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome
        """

        genome_features = self.gsu.search({'ref': genome_ref,
                                           'limit': len(ids),
                                           'structured_query': {"$or": [{"feature_id": x}
                                                                        for x in ids]},
                                           'sort_by': [['feature_id', True]]})['features']

        feature_ids = {feature.get('feature_id') for feature in genome_features}

        return feature_ids
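
    # GenomeSearchUtil.search only returns features present in the genome,
    # so the result can be used to validate user-supplied IDs (see
    # _build_fs_obj below).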

    def _build_fs_obj(self, params):
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        genome_ref = params['genome']
        if params.get('base_feature_sets', []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']}
            )['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                new_feature_set['element_ordering'] += [x for x in base_set['element_ordering']
                                                        if x not in new_feature_set['elements']]
                for element, genome_refs in base_set['elements'].items():
                    if element in new_feature_set['elements']:
                        new_feature_set['elements'][element] += [x for x in genome_refs if x not in
                                                                 new_feature_set['elements'][
                                                                     element]]
                    else:
                        new_feature_set['elements'][element] = genome_refs
                new_feature_set['description'] += 'From FeatureSet {}: {}\n'.format(
                    base_set_name, base_set.get('description'))
        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')
        if new_feature_ids:
            genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError('Feature ID {} does not exist in the supplied genome {}'.format(
                    new_feature, genome_ref))
            if new_feature in new_feature_set['elements']:
                if genome_ref not in new_feature_set['elements'][new_feature]:
                    new_feature_set['elements'][new_feature].append(genome_ref)
            else:
                new_feature_set['elements'][new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)

        if params.get('description'):
            new_feature_set['description'] = params['description']

        return new_feature_set
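
    # Minimal params sketch for _build_fs_obj (refs/names hypothetical):
    #   {'genome': '1/2/3',
    #    'feature_ids': ['gene_a', 'gene_b'],
    #    'base_feature_sets': [],
    #    'description': 'my set'}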

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "log2+1", "log10+1"]
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated up-regulated FeatureSet object references
        down_feature_set_ref_list: list of generated down-regulated FeatureSet object references
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object refs
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3(
            {"objects": [{"ref": diff_expression_set_ref}]})['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs,
         available_condition_labels) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs, available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [x.strip() for x in label_string.split(',')]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                                                                diff_expression_set_ref,
                                                                result_directory,
                                                                condition_label_pair)
            up_feature_ids, down_feature_ids = self._process_matrix_file(
                                                                diff_expr_matrix_file,
                                                                params.get('p_cutoff'),
                                                                params.get('q_cutoff'),
                                                                params.get('fold_change_cutoff'))
            filtered_em_name = (_sanitize_name(condition_string) +
                                params.get('filtered_expression_matrix_suffix'))
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                                                params.get('expression_matrix_ref'),
                                                up_feature_ids + down_feature_ids,
                                                params.get('workspace_name'), "",
                                                diff_expr_matrix_ref, filtered_em_name)
                filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix', "")
            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(up_feature_ids,
                                                            genome_id,
                                                            params.get('workspace_name'),
                                                            up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(down_feature_ids,
                                                              genome_id,
                                                              params.get('workspace_name'),
                                                              down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref_list': up_feature_set_ref_list,
                     'down_feature_set_ref_list': down_feature_set_ref_list,
                     'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list}

        report_output = self._generate_report(up_feature_set_ref_list, down_feature_set_ref_list,
                                              filtered_expression_matrix_ref_list,
                                              params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal
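
    # Call sketch (refs, cutoffs and the instance name are illustrative only;
    # substitute the actual class name for ThisClass):
    #   fsu = ThisClass(config)
    #   out = fsu.upload_featureset_from_diff_expr({
    #       'diff_expression_ref': '1/2/3',
    #       'expression_matrix_ref': '1/4/1',
    #       'p_cutoff': 0.05, 'q_cutoff': 0.05, 'fold_change_cutoff': 1,
    #       'fold_scale_type': 'linear',
    #       'feature_set_suffix': '_feature_set',
    #       'filtered_expression_matrix_suffix': '_filtered',
    #       'run_all_combinations': True,
    #       'workspace_name': 'my_workspace'})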

    def filter_matrix_with_fs(self, params):
        self.validate_params(params, ('feature_set_ref', 'workspace_name',
                                      'expression_matrix_ref', 'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]}
        )['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids, params['workspace_name'],
            params['filtered_expression_matrix_suffix'])

        objects_created = [{'ref': filtered_matrix_ref,
                            'description': 'Filtered ExpressionMatrix Object'}]
        message = "Filtered Expression Matrix based of the {} feature ids present in {}"\
            .format(len(feature_ids), feature_set_name)

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'filtered_expression_matrix_ref': filtered_matrix_ref,
                'report_name': output['name'], 'report_ref': output['ref']}
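
    # Call sketch (refs/names hypothetical):
    #   out = fsu.filter_matrix_with_fs({
    #       'feature_set_ref': '1/5/1',
    #       'expression_matrix_ref': '1/4/1',
    #       'filtered_expression_matrix_suffix': '_filtered',
    #       'workspace_name': 'my_workspace'})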

    def build_feature_set(self, params):
        self.validate_params(params, {'output_feature_set', 'workspace_name'},
                             {'genome', 'feature_ids', 'feature_ids_custom', 'base_feature_sets',
                              'description'})
        feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError("You must supply at least one feature source: {}".format(
                ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': new_feature_set,
                         'name': params['output_feature_set']}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        objects_created = [{'ref': feature_set_obj_ref,
                            'description': 'Feature Set'}]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'feature_set_ref': feature_set_obj_ref,
                'report_name': output['name'], 'report_ref': output['ref']}
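
    # Call sketch (refs/names hypothetical):
    #   out = fsu.build_feature_set({
    #       'output_feature_set': 'my_feature_set',
    #       'workspace_name': 'my_workspace',
    #       'genome': '1/2/3',
    #       'feature_ids': ['gene_a', 'gene_b']})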