예제 #1
0
def make_halo(search_distances, scan_distances, ratios=None):
    """Build the Halobacterium ('hal') organism object used by the tests.

    Reads the KEGG and GO mapping files, sets up the RSAT and Microbes
    Online accessors and assembles the network factories before
    constructing the org.Microbe instance.

    search_distances and scan_distances are passed through unchanged to
    org.Microbe. ratios is optional: the operon network factory is only
    added when it is given, with max_operon_size derived from
    ratios.num_rows.
    """
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR,
                               'Halobacterium sp', 64091)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    stringfile = 'testdata/string_links_64091.tab'

    nw_factories = []
    # identity comparison is the reliable way to test for None
    if stringfile is not None:
        nw_factories.append(
            stringdb.get_network_factory('hal', stringfile, 0.5))
    else:
        # logging.warn is a deprecated alias, logging.warning is the
        # supported spelling
        logging.warning("no STRING file specified !")

    if ratios is not None:
        nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=ratios.num_rows / 20, weight=0.5))

    keggorg = util.make_dfile_map(keggfile, 1, 3)['hal']
    rsat_organism = rsatdb.get_rsat_organism(keggorg)
    rsat_info = org.RsatSpeciesInfo(rsatdb, keggorg, rsat_organism, 64091)
    gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
    return org.Microbe('hal', keggorg, rsat_info, gotax, mo_db, nw_factories,
                       search_distances, scan_distances, True, None)
예제 #2
0
def make_halo(ratio_matrix, search_distances, scan_distances):
    """Build the 'hal' organism object through an org.MicrobeFactory.

    Two network factories are configured: a STRING network read from the
    bundled test data file and an operon network whose max_operon_size is
    derived from ratio_matrix.num_rows. search_distances and
    scan_distances are forwarded to the factory's create() call.
    """
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, CACHE_DIR)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    stringfile = 'testdata/string_links_64091.tab'

    nw_factories = []
    # identity comparison is the reliable way to test for None
    if stringfile is not None:
        nw_factories.append(
            stringdb.get_network_factory2('hal',
                                          stringfile,
                                          0.5,
                                          normalized=True))
    else:
        # logging.warn is a deprecated alias, logging.warning is the
        # supported spelling
        logging.warning("no STRING file specified !")

    nw_factories.append(
        microbes_online.get_network_factory(
            mo_db, max_operon_size=ratio_matrix.num_rows / 20, weight=0.5))

    org_factory = org.MicrobeFactory(org.make_kegg_code_mapper(keggfile),
                                     org.make_rsat_organism_mapper(rsatdb),
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db, nw_factories)

    return org_factory.create('hal', search_distances, scan_distances)
예제 #3
0
def make_halo(search_distances, scan_distances):
    """Create the 'hal' organism through an org.MicrobeFactory.

    No network factories are configured (an empty list is handed to the
    factory). search_distances and scan_distances are forwarded to the
    factory's create() call.
    """
    kegg_table = util.read_dfile(KEGG_FILE_PATH, comment='#')
    go_table = util.read_dfile(GO_FILE_PATH)
    rsat_database = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR)
    microbes_db = microbes_online.MicrobesOnline()

    # build the three mappers first, then wire them into the factory
    kegg_mapper = org.make_kegg_code_mapper(kegg_table)
    rsat_mapper = org.make_rsat_organism_mapper(rsat_database)
    go_mapper = org.make_go_taxonomy_mapper(go_table)
    factory = org.MicrobeFactory(kegg_mapper, rsat_mapper, go_mapper,
                                 microbes_db, [])
    return factory.create('hal', search_distances, scan_distances)
예제 #4
0
def make_halo(search_distances, scan_distances):
    """Create the 'hal' organism through an org.MicrobeFactory.

    The RSAT and Microbes Online accessors both use CACHE_DIR. No network
    factories are configured (an empty list is handed to the factory).
    search_distances and scan_distances are forwarded to create().
    """
    kegg_table = util.read_dfile(KEGG_FILE_PATH, comment='#')
    go_table = util.read_dfile(GO_FILE_PATH)
    rsat_database = rsat.RsatDatabase(rsat.RSAT_BASE_URL, CACHE_DIR)
    microbes_db = microbes_online.MicrobesOnline(CACHE_DIR)

    # build the three mappers first, then wire them into the factory
    kegg_mapper = org.make_kegg_code_mapper(kegg_table)
    rsat_mapper = org.make_rsat_organism_mapper(rsat_database)
    go_mapper = org.make_go_taxonomy_mapper(go_table)
    factory = org.MicrobeFactory(kegg_mapper, rsat_mapper, go_mapper,
                                 microbes_db, [])
    return factory.create('hal', search_distances, scan_distances)
예제 #5
0
 def __get_kegg_data(self):
     """Return the NCBI code and the KEGG map entry of the organism.

     The KEGG file is taken from USER_KEGG_FILE_PATH if that path exists,
     otherwise from SYSTEM_KEGG_FILE_PATH; an Exception is raised when
     neither exists. If self['ncbi_code'] is None and the organism code
     appears in the KEGG file's code mapping, self['ncbi_code'] is filled
     in from that mapping.

     Returns a (ncbi_code, kegg_entry) pair.
     """
     organism_code = self['organism_code']

     # the user file takes precedence over the system-wide one
     for candidate in (USER_KEGG_FILE_PATH, SYSTEM_KEGG_FILE_PATH):
         if os.path.exists(candidate):
             keggfile = util.read_dfile(candidate, comment='#')
             break
     else:
         raise Exception('KEGG file not found !!')

     kegg_map = util.make_dfile_map(keggfile, 1, 3)
     kegg2ncbi = util.make_dfile_map(keggfile, 1, 2)
     ncbi_missing = self['ncbi_code'] is None
     if ncbi_missing and organism_code in kegg2ncbi:
         self['ncbi_code'] = kegg2ncbi[organism_code]
     return self['ncbi_code'], kegg_map[organism_code]
예제 #6
0
 def __make_organism(self):
     """Build a mock organism from the Halobacterium test data files.

     Every row of the features file becomes an st.Feature keyed by its
     first column; the synonyms come from the feature names file.
     """
     feature_table = util.read_dfile('testdata/Halobacterium_sp_features',
                                     comment='--')
     features = {}
     for row in feature_table.lines:
         # the last Location argument is True when column 6 equals 'R'
         location = st.Location(row[3], int(row[4]), int(row[5]),
                                row[6] == 'R')
         features[row[0]] = st.Feature(row[0], row[1], row[2], location)

     name_table = util.read_dfile(
         'testdata/Halobacterium_sp_feature_names', comment='--')
     synonyms = th.create_from_rsat_feature_names(name_table)
     return MockOrganismWithSynonyms('64091', features, synonyms)
예제 #7
0
 def __make_organism(self):
     """Build a mock organism from the Halobacterium test data files.

     Every row of the features file becomes an st.Feature keyed by its
     first column; the synonyms come from the feature names file.
     """
     feature_table = util.read_dfile('testdata/Halobacterium_sp_features',
                                     comment='--')
     features = {}
     for row in feature_table.lines:
         # the last Location argument is True when column 6 equals 'R'
         location = st.Location(row[3], int(row[4]), int(row[5]),
                                row[6] == 'R')
         features[row[0]] = st.Feature(row[0], row[1], row[2], location)

     name_table = util.read_dfile('testdata/Halobacterium_sp_feature_names',
                                  comment='--')
     synonyms = th.create_from_rsat_feature_names(name_table)
     return MockOrganismWithSynonyms('64091', features, synonyms)
    def test_motif_scoring(self):
        """Run MemeScoringFunction.compute() once on the halo example data.

        Integration smoke test: nothing is asserted on the computed
        matrix, the test only requires that compute() completes.
        """
        search_distances = {'upstream': (-20, 150)}
        scan_distances = {'upstream': (-30, 250)}

        filters = [dm.nochange_filter, dm.center_scale_filter]
        matrix_factory = dm.DataMatrixFactory(filters)
        infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                                 has_header=True, quote='\"')
        ratio_matrix = matrix_factory.create_from(infile)
        organism = testutil.make_halo(search_distances, scan_distances,
                                      ratio_matrix)
        membership = FakeMembership()

        # the MEME and Motifs sections are spelled out separately to keep
        # the top-level configuration readable
        meme_settings = {
            'schedule': lambda i: True,
            'version': '4.3.0',
            'global_background': False,
            'arg_mod': 'zoops',
            'nmotifs_rvec': 'c(rep(1, num_iterations/3), rep(2, num_iterations/3))',
            'use_revcomp': 'True',
            'max_width': 24,
            'background_order': 3,
        }
        motif_settings = {
            'schedule': lambda i: True,
            'scaling': ('scaling_const', 1.0),
        }
        config_params = {
            'memb.min_cluster_rows_allowed': 3,
            'memb.max_cluster_rows_allowed': 70,
            'multiprocessing': False,
            'num_clusters': 1,
            'output_dir': 'out',
            'debug': {},
            'search_distances': {'upstream': (-20, 150)},
            'num_iterations': 2000,
            'MEME': meme_settings,
            'Motifs': motif_settings,
        }
        func = motif.MemeScoringFunction(organism, membership, ratio_matrix,
                                         config_params=config_params)
        iteration_result = {'iteration': 100}
        matrix = func.compute(iteration_result)
예제 #9
0
    def read_edges2(filename, organism, ratios):
        """Read a preprocessed edge file and return the accepted edges.

        Each line yields a (node1, node2, score) tuple after both node
        names went through patches.patch_string_gene(). Edges rejected by
        can_add_edge() are counted and dropped. Unless the enclosing
        scope's 'normalized' flag is set, the edges are passed through
        normalize_edges_to_max_score() with the highest score seen.
        """
        logging.info("stringdb.read_edges2()")
        dfile = util.read_dfile(filename, sep)
        thesaurus = organism.thesaurus()

        # restrict to genes of the ratio matrix only if one was supplied
        cano_genes = None
        if ratios:
            cano_genes = {thesaurus[row] for row in ratios.row_names
                          if row in thesaurus}

        edges = []
        skipped = 0
        highest = 0.0
        for fields in dfile.lines:
            source = patches.patch_string_gene(organism_code, fields[0])
            target = patches.patch_string_gene(organism_code, fields[1])
            weight = float(fields[2])
            # rejected edges still contribute to the maximum score
            highest = max(weight, highest)

            if not can_add_edge(source, target, thesaurus, cano_genes):
                skipped += 1
                continue
            edges.append((intern(source), intern(target), weight))

        if not normalized:
            edges = normalize_edges_to_max_score(edges, highest)

        logging.info("stringdb.read_edges2(), %d edges read, %d edges ignored",
                     len(edges), skipped)
        return edges
예제 #10
0
def prepare_ensemble_matrix(ratiofile, outdir, n, kmin):
    """Read the ratio file and split the resulting matrix for an ensemble.

    Does nothing when ratiofile does not exist. Otherwise the matrix is
    created with the nochange and center/scale filters and handed to
    split_matrix() together with outdir, n, kmin and its column count.
    """
    filters = [nochange_filter, center_scale_filter]
    matrix_factory = DataMatrixFactory(filters)
    if not os.path.exists(ratiofile):
        return

    infile = util.read_dfile(ratiofile, has_header=True, quote='\"')
    matrix = matrix_factory.create_from(infile)
    split_matrix(matrix, outdir, n, kmin, matrix.num_columns)
예제 #11
0
    def setUp(self):  # pylint: disable-msg=C0103
        """Prepare distances, ratio matrix, organism, config and membership."""
        num_clusters = 43
        self.search_distances = {'upstream': (-20, 150)}
        self.scan_distances = {'upstream': (-30, 250)}

        filters = [dm.nochange_filter, dm.center_scale_filter]
        matrix_factory = dm.DataMatrixFactory(filters)
        infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                                 has_header=True, quote='\"')
        self.ratio_matrix = matrix_factory.create_from(infile)
        self.organism = make_halo(self.ratio_matrix,
                                  self.search_distances,
                                  self.scan_distances)

        clusters_per_col = int(round(num_clusters * 2.0 / 3.0))
        self.config_params = {
            'memb.min_cluster_rows_allowed': 3,
            'memb.max_cluster_rows_allowed': 70,
            'multiprocessing': False,
            'memb.clusters_per_row': 2,
            'memb.clusters_per_col': clusters_per_col,
            'num_clusters': num_clusters,
            'output_dir': 'out',
            'remap_network_nodes': False,
            'num_iterations': 2000,
            'debug': False,
        }
        # the membership is read last because it relies on config_params
        self.membership = self.__read_members()
        self.iteration_result = {'iteration': 51}
예제 #12
0
    def test_motif_scoring(self):
        """Run MemeScoringFunction.compute() once on the halo ratio data.

        Integration smoke test: nothing is asserted on the computed
        matrix, the test only requires that compute() completes.
        """
        search_distances = {'upstream': (-20, 150)}
        scan_distances = {'upstream': (-30, 250)}

        filters = [dm.nochange_filter, dm.center_scale_filter]
        matrix_factory = dm.DataMatrixFactory(filters)
        infile = util.read_dfile('halo_ratios5.tsv',
                                 has_header=True, quote='\"')
        ratio_matrix = matrix_factory.create_from(infile)

        meme_suite = meme.MemeSuite430(remove_tempfiles=True)
        low_complexity_filter = motif.get_remove_low_complexity_filter(
            meme_suite)
        atgs_filter = motif.get_remove_atgs_filter(
            search_distances['upstream'])
        sequence_filters = [motif.unique_filter,
                            low_complexity_filter,
                            atgs_filter]

        organism = make_halo(ratio_matrix, search_distances, scan_distances)
        membership = FakeMembership()
        config_params = {
            'memb.min_cluster_rows_allowed': 3,
            'memb.max_cluster_rows_allowed': 70,
            'multiprocessing': False,
            'num_clusters': 1,
            'output_dir': 'out',
            'num_iterations': 2000,
        }
        func = motif.MemeScoringFunction(
            organism, membership, ratio_matrix, meme_suite,
            sequence_filters=sequence_filters,
            scaling_func=lambda iter: 1.0,
            num_motif_func=motif.default_nmotif_fun,
            config_params=config_params)
        iteration_result = {'iteration': 100}
        matrix = func.compute(iteration_result)
        """
예제 #13
0
 def test_read_with_quotes(self):
     """Reads a semicolon delimited file with quotes.

     The file is read without a header, with '#' as comment marker and
     '"' as quote character; the first two lines must come back as the
     expected unquoted value lists.
     """
     dfile = util.read_dfile("testdata/withquotes.ssv", sep=';',
                             has_header=False, comment='#', quote='"')
     lines = dfile.lines
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(["value11", "value12"], lines[0])
     self.assertEqual(["value21", "value22"], lines[1])
예제 #14
0
    def test_motif_scoring(self):
        """Run MemeScoringFunction.compute() once on the halo example data.

        Integration smoke test: nothing is asserted on the computed
        matrix, the test only requires that compute() completes.
        """
        search_distances = {'upstream': (-20, 150)}
        scan_distances = {'upstream': (-30, 250)}

        filters = [dm.nochange_filter, dm.center_scale_filter]
        matrix_factory = dm.DataMatrixFactory(filters)
        infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                                 has_header=True, quote='\"')
        ratio_matrix = matrix_factory.create_from(infile)

        meme_suite = meme.MemeSuite430(remove_tempfiles=True)
        low_complexity_filter = motif.get_remove_low_complexity_filter(
            meme_suite)
        atgs_filter = motif.get_remove_atgs_filter(
            search_distances['upstream'])
        sequence_filters = [motif.unique_filter,
                            low_complexity_filter,
                            atgs_filter]

        organism = make_halo(ratio_matrix, search_distances, scan_distances)
        membership = FakeMembership()
        config_params = {
            'memb.min_cluster_rows_allowed': 3,
            'memb.max_cluster_rows_allowed': 70,
            'multiprocessing': False,
            'num_clusters': 1,
            'output_dir': 'out',
            'debug': False,
            'num_iterations': 2000,
        }
        func = motif.MemeScoringFunction(
            organism, membership, ratio_matrix, meme_suite,
            sequence_filters=sequence_filters,
            scaling_func=lambda iter: 1.0,
            num_motif_func=lambda iter: 1,
            update_in_iteration=lambda x: True,
            motif_in_iteration=lambda x: True,
            config_params=config_params)
        iteration_result = {'iteration': 100}
        matrix = func.compute(iteration_result)
        """
예제 #15
0
    def __sequences_for_genes(self, seqtype, genes, distance):
        """Return the sequences of the given type for the given genes.

        The sequence file registered for seqtype is read on first use and
        kept in self.__seqs; identifiers and sequences are upper-cased
        while loading. Aliases missing from the thesaurus and genes
        without a sequence are skipped silently. The distance argument is
        not used.

        Returns a dict mapping each resolved gene name to a
        (st.Location, sequence) pair.
        """
        if seqtype not in self.__seqs:
            logging.info('loading %s sequences' % seqtype)
            dfile = util.read_dfile(self.__seq_filenames[seqtype], sep=',')
            loaded = self.__seqs[seqtype] = {}
            for row in dfile.lines:
                loaded[row[0].upper()] = row[1].upper()
            logging.info('loaded %i %s sequences' %
                         (len(loaded), seqtype))

        sequences = self.__seqs[seqtype]
        result = {}
        for alias in genes:
            if alias not in self.thesaurus():
                continue
            gene = self.thesaurus()[alias]
            if gene not in sequences:
                continue
            # callers expect (location, sequence) pairs, so a placeholder
            # Location is supplied even though it carries no position
            result[gene] = (st.Location(gene, 0, 0, False),
                            sequences[gene])
        return result
예제 #16
0
def prepare_ensemble_matrix(ratiofile, outdir, n, kmin):
    """Read the ratio file and split the resulting matrix for an ensemble.

    Does nothing when ratiofile does not exist. Otherwise the matrix is
    created with the nochange and center/scale filters and handed to
    split_matrix() together with outdir, n, kmin and its column count.
    """
    filters = [nochange_filter, center_scale_filter]
    matrix_factory = DataMatrixFactory(filters)
    if not os.path.exists(ratiofile):
        return

    infile = util.read_dfile(ratiofile, has_header=True, quote='\"')
    matrix = matrix_factory.create_from(infile)
    split_matrix(matrix, outdir, n, kmin, matrix.num_columns)
예제 #17
0
 def test_read_with_tabs(self):
     """Reads a tab delimited file.

     read_dfile() is called with its default arguments; the first two
     lines must match the expected values and no header may be reported.
     """
     dfile = util.read_dfile("testdata/simple.tsv")
     lines = dfile.lines
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(["value11", "value12"], lines[0])
     self.assertEqual(["value21", "value22"], lines[1])
     self.assertIsNone(dfile.header)
예제 #18
0
    def __sequences_for_genes(self, seqtype, genes, distance):
        """Return the sequences of the given type for the given genes.

        The sequence file registered for seqtype is read on first use and
        kept in self.__seqs; identifiers and sequences are upper-cased
        while loading. Aliases missing from the thesaurus and genes
        without a sequence are skipped silently. The distance argument is
        not used.

        Returns a dict mapping each resolved gene name to a
        (st.Location, sequence) pair.
        """
        if seqtype not in self.__seqs:
            logging.info('loading %s sequences' % seqtype)
            dfile = util.read_dfile(self.__seq_filenames[seqtype], sep=',')
            loaded = self.__seqs[seqtype] = {}
            for row in dfile.lines:
                loaded[row[0].upper()] = row[1].upper()
            logging.info('loaded %i %s sequences' % (len(loaded), seqtype))

        sequences = self.__seqs[seqtype]
        result = {}
        for alias in genes:
            if alias not in self.thesaurus():
                continue
            gene = self.thesaurus()[alias]
            if gene not in sequences:
                continue
            # callers expect (location, sequence) pairs, so a placeholder
            # Location is supplied even though it carries no position
            result[gene] = (st.Location(gene, 0, 0, False), sequences[gene])
        return result
예제 #19
0
 def test_read_with_semicolon_header_and_comments(self):
     """Reads a semicolon delimited file with a header and comments.

     Expects exactly two data lines and the header row to be reported
     separately through dfile.header.
     """
     dfile = util.read_dfile("testdata/withcomments.ssv", sep=';',
                             has_header=True, comment='#')
     lines = dfile.lines
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(2, len(lines))
     self.assertEqual(["header1", "header2"], dfile.header)
예제 #20
0
 def read_edges2(filename):
     """Read a preprocessed edge file into (node1, node2, score) tuples.

     The first two columns are taken as they are, the third one is
     converted to float. The separator comes from the enclosing scope.
     """
     logging.info("\x1b[31mstringdb:\t\x1b[0mreading interaction network - stringdb.read_edges2()")
     dfile = util.read_dfile(filename, sep)
     return [(row[0], row[1], float(row[2])) for row in dfile.lines]
예제 #21
0
 def __make_ref_operon_pairs(self):
     """Return the reference operon pairs used for comparison.

     Each pair consists of columns 1 and 2 of a line in the reference
     table, which is read with a header and '"' as quote character.
     """
     reference = util.read_dfile('testdata/operon_reftable.tsv',
                                 has_header=True, quote='"')
     return [(row[1], row[2]) for row in reference.lines]
예제 #22
0
 def read_edges3(filename):
     """Read a preprocessed comma separated edge file with a header.

     Returns one [node1, node2, score] list per line, built from columns
     1 to 3 with the score converted to float.
     """
     logging.info("stringdb.read_edges3()")
     dfile = util.read_dfile(filename, sep=",", has_header=True, quote='"')
     return [[row[1], row[2], float(row[3])] for row in dfile.lines]
예제 #23
0
 def read_csv(cls, name, infile, cutoff=None, sep=','):
     """Read a set type from a delimited file.

     Column 0 names the set and column 1 the member, which is added in
     upper case with the value 1 to a 'discrete' EnrichmentSet. The
     cutoff argument is not used. Returns a SetType called name.
     """
     dfile = util.read_dfile(infile, sep)
     sets = {}
     for row in dfile.lines:
         set_name = row[0]
         # create the set on first sight of its name
         if set_name not in sets:
             sets[set_name] = EnrichmentSet('discrete')
         members = sets[set_name]
         members.add(row[1].upper(), 1)
     return SetType(name, sets)
예제 #24
0
 def __make_ref_operon_pairs(self):
     """Return the reference operon pairs used for comparison.

     Each pair consists of columns 1 and 2 of a line in the reference
     table, which is read with a header and '"' as quote character.
     """
     reference = util.read_dfile('testdata/operon_reftable.tsv',
                                 has_header=True, quote='"')
     return [(row[1], row[2]) for row in reference.lines]
예제 #25
0
 def read_csv(cls, name, infile, cutoff=None, sep=','):
     """Read a set type from a delimited file.

     Column 0 names the set and column 1 the member, which is added in
     upper case with the value 1 to a 'discrete' EnrichmentSet. The
     cutoff argument is not used. Returns a SetType called name.
     """
     dfile = util.read_dfile(infile, sep)
     sets = {}
     for row in dfile.lines:
         set_name = row[0]
         # create the set on first sight of its name
         if set_name not in sets:
             sets[set_name] = EnrichmentSet('discrete')
         members = sets[set_name]
         members.add(row[1].upper(), 1)
     return SetType(name, sets)
예제 #26
0
 def test_read_with_empty_lines(self):
     """Reads a semicolon delimited file containing empty lines.

     Expects the header to be reported separately and exactly two data
     lines with the expected values to remain.
     """
     dfile = util.read_dfile("testdata/withemptylines.ssv", sep=';',
                             has_header=True, comment='#', quote='"')
     lines = dfile.lines
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(["header1", "header2"], dfile.header)
     self.assertEqual(2, len(lines))
     self.assertEqual(["value11", "value12"], lines[0])
     self.assertEqual(["value21", "value22"], lines[1])
예제 #27
0
 def read_edges2(filename):
     """Read a preprocessed edge file into [node1, node2, score] lists.

     Both node names go through patches.patch_string_gene() with the
     organism code of the enclosing scope; the score is converted to
     float.
     """
     logging.info("stringdb.read_edges2()")
     dfile = util.read_dfile(filename, sep)
     return [[patches.patch_string_gene(organism_code, row[0]),
              patches.patch_string_gene(organism_code, row[1]),
              float(row[2])]
             for row in dfile.lines]
예제 #28
0
    def make_microbe(self):
        """Create the organism object for the configured organism code.

        Reads the KEGG and GO mapping files, builds the KEGG and RSAT
        mappers and assembles the STRING and operon network factories
        before delegating to org.MicrobeFactory.

        When no STRING file is configured, it is fetched through
        util.get_url_cached() from STRING_URL_PATTERN (filled in with the
        organism's taxonomy id) into the cache directory, and the
        resulting path is stored under self['string_file'].
        """
        keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
        gofile = util.read_dfile(GO_FILE_PATH)
        rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, self['cache_dir'])
        mo_db = microbes_online.MicrobesOnline()
        # NOTE(review): the STRING file is read from self.config_params
        # but written back through self[...] below -- confirm both address
        # the same setting
        stringfile = self.config_params['string_file']
        kegg_mapper = org.make_kegg_code_mapper(keggfile)
        rsat_mapper = org.make_rsat_organism_mapper(rsatdb)

        # automatically download STRING file
        if stringfile is None:
            rsat_info = rsat_mapper(kegg_mapper(self['organism_code']))
            ncbi_code = rsat_info.taxonomy_id
            # reported through logging instead of a Python 2 print statement
            logging.info("NCBI CODE IS: %s", ncbi_code)
            url = STRING_URL_PATTERN % ncbi_code
            stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
            self['string_file'] = stringfile
            logging.info("Automatically using STRING file in '%s'", stringfile)
            util.get_url_cached(url, stringfile)

        # stringfile is always set at this point (configured or downloaded
        # above), so the STRING factory is added unconditionally
        nw_factories = [stringdb.get_network_factory2(
            self['organism_code'], stringfile, 0.5)]

        nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=self.ratio_matrix.num_rows / 20,
            weight=0.5))

        org_factory = org.MicrobeFactory(kegg_mapper,
                                         rsat_mapper,
                                         org.make_go_taxonomy_mapper(gofile),
                                         mo_db,
                                         nw_factories)
        return org_factory.create(self['organism_code'],
                                  self['search_distances'],
                                  self['scan_distances'])
예제 #29
0
def create_from_delimited_file2(dfile):
    """Create a thesaurus from a delimited file.

    Expected line format: <original>SEPARATOR<alt1>;<alt2>;...

    dfile is either a file name (read as a comma separated file without
    header) or an object that already provides the parsed 'lines'. All
    names are upper-cased; the returned dict maps every original and
    every alternative name to the original.
    """
    if isinstance(dfile, str):
        dfile = util.read_dfile(dfile, sep=',', has_header=False)

    thesaurus = {}
    for row in dfile.lines:
        canonical = row[0].upper()
        # the canonical name resolves to itself
        thesaurus[canonical] = canonical
        thesaurus.update((synonym.upper(), canonical)
                         for synonym in row[1].split(';'))
    return thesaurus
예제 #30
0
def create_from_delimited_file2(dfile):
    """Create a thesaurus from a delimited file.

    Expected line format: <original>SEPARATOR<alt1>;<alt2>;...

    dfile is either a file name (read as a comma separated file without
    header) or an object that already provides the parsed 'lines'. All
    names are upper-cased; the returned dict maps every original and
    every alternative name to the original.
    """
    if isinstance(dfile, str):
        dfile = util.read_dfile(dfile, sep=',', has_header=False)

    thesaurus = {}
    for row in dfile.lines:
        canonical = row[0].upper()
        # the canonical name resolves to itself
        thesaurus[canonical] = canonical
        thesaurus.update((synonym.upper(), canonical)
                         for synonym in row[1].split(';'))
    return thesaurus
예제 #31
0
 def read_edges2(filename):
     """Read a preprocessed edge file into (node1, node2, score) tuples.

     Both node names go through patches.patch_string_gene(). Unless the
     enclosing scope's 'normalized' flag is set,
     normalize_edges_to_max_score() is called with the edges and the
     highest score found.
     """
     logging.info("stringdb.read_edges2()")
     dfile = util.read_dfile(filename, sep)
     edges = []
     top_score = 0.0
     for row in dfile.lines:
         weight = float(row[2])
         top_score = max(weight, top_score)
         first = patches.patch_string_gene(organism_code, row[0])
         second = patches.patch_string_gene(organism_code, row[1])
         edges.append((first, second, weight))

     if normalized:
         return edges
     # NOTE(review): the return value of normalize_edges_to_max_score() is
     # discarded although the edges are immutable tuples -- confirm the
     # helper really updates the list in place
     normalize_edges_to_max_score(edges, top_score)
     return edges
예제 #32
0
def make_halo(ratio_matrix, search_distances, scan_distances):
    """Build the 'hal' organism object through an org.MicrobeFactory.

    Two network factories are configured: a STRING network read from
    'string_links_64091.tab' and an operon network whose max_operon_size
    is derived from ratio_matrix.num_rows. search_distances and
    scan_distances are forwarded to the factory's create() call.
    """
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR)
    mo_db = microbes_online.MicrobesOnline()
    stringfile = 'string_links_64091.tab'

    nw_factories = []
    # identity comparison is the reliable way to test for None
    if stringfile is not None:
        nw_factories.append(
            stringdb.get_network_factory2('hal', stringfile, 0.5))
    else:
        # logging.warn is a deprecated alias, logging.warning is the
        # supported spelling
        logging.warning("no STRING file specified !")

    nw_factories.append(microbes_online.get_network_factory(
        mo_db, max_operon_size=ratio_matrix.num_rows / 20, weight=0.5))

    org_factory = org.MicrobeFactory(org.make_kegg_code_mapper(keggfile),
                                     org.make_rsat_organism_mapper(rsatdb),
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db,
                                     nw_factories)

    return org_factory.create('hal', search_distances, scan_distances)
예제 #33
0
 def read_edges2(filename):
     """Read a preprocessed edge file into (node1, node2, score) tuples.

     Both node names go through patches.patch_string_gene(). Unless the
     enclosing scope's 'normalized' flag is set,
     normalize_edges_to_max_score() is called with the edges and the
     highest score found.
     """
     logging.info("stringdb.read_edges2()")
     dfile = util.read_dfile(filename, sep)
     edges = []
     top_score = 0.0
     for row in dfile.lines:
         weight = float(row[2])
         top_score = max(weight, top_score)
         first = patches.patch_string_gene(organism_code, row[0])
         second = patches.patch_string_gene(organism_code, row[1])
         edges.append((first, second, weight))

     if normalized:
         return edges
     # NOTE(review): the return value of normalize_edges_to_max_score() is
     # discarded although the edges are immutable tuples -- confirm the
     # helper really updates the list in place
     normalize_edges_to_max_score(edges, top_score)
     return edges
예제 #34
0
def create_from_delimited_file2(dfile, case_sensitive):
    """Create a thesaurus from a delimited file.

    Expected line format: <original>SEPARATOR<alt1>;<alt2>;...

    dfile is either a file name (read as a comma separated file without
    header) or an object that already provides the parsed 'lines'. Names
    are upper-cased unless case_sensitive is true, and interned. The
    returned dict maps every original and every alternative name to the
    original.
    """
    def fix_case(text):
        # names are only left untouched in case sensitive mode
        if case_sensitive:
            return text
        return text.upper()

    if isinstance(dfile, str):
        dfile = util.read_dfile(dfile, sep=',', has_header=False)

    thesaurus = {}
    for row in dfile.lines:
        canonical = intern(fix_case(row[0]))
        # the canonical name resolves to itself
        thesaurus[canonical] = canonical
        for synonym in row[1].split(';'):
            key = intern(fix_case(synonym))
            thesaurus[key] = canonical
    return thesaurus
예제 #35
0
def create_from_delimited_file2(dfile, case_sensitive):
    """Create a thesaurus from a delimited file.

    Expected line format: <original>SEPARATOR<alt1>;<alt2>;...

    dfile is either a file name (read as a comma separated file without
    header) or an object that already provides the parsed 'lines'. Names
    are upper-cased unless case_sensitive is true, and interned. The
    returned dict maps every original and every alternative name to the
    original.
    """
    def fix_case(text):
        # names are only left untouched in case sensitive mode
        if case_sensitive:
            return text
        return text.upper()

    if isinstance(dfile, str):
        dfile = util.read_dfile(dfile, sep=',', has_header=False)

    thesaurus = {}
    for row in dfile.lines:
        canonical = intern(fix_case(row[0]))
        # the canonical name resolves to itself
        thesaurus[canonical] = canonical
        for synonym in row[1].split(';'):
            key = intern(fix_case(synonym))
            thesaurus[key] = canonical
    return thesaurus
예제 #36
0
    def test_motif_scoring(self):
        """Run MemeScoringFunction.compute() once on the halo example data.

        Integration smoke test: nothing is asserted on the computed
        matrix, the test only requires that compute() completes.
        """
        search_distances = {'upstream': (-20, 150)}
        scan_distances = {'upstream': (-30, 250)}

        filters = [dm.nochange_filter, dm.center_scale_filter]
        matrix_factory = dm.DataMatrixFactory(filters)
        infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                                 has_header=True, quote='\"')
        ratio_matrix = matrix_factory.create_from(infile)
        organism = testutil.make_halo(search_distances, scan_distances,
                                      ratio_matrix)
        membership = FakeMembership()

        # the MEME and Motifs sections are spelled out separately to keep
        # the top-level configuration readable
        meme_settings = {
            'schedule': lambda i: True,
            'version': '4.3.0',
            'global_background': False,
            'arg_mod': 'zoops',
            'nmotifs_rvec': 'c(rep(1, num_iterations/3), rep(2, num_iterations/3))',
            'use_revcomp': 'True',
            'max_width': 24,
            'background_order': 3,
        }
        motif_settings = {
            'schedule': lambda i: True,
            'scaling': ('scaling_const', 1.0),
        }
        config_params = {
            'memb.min_cluster_rows_allowed': 3,
            'memb.max_cluster_rows_allowed': 70,
            'multiprocessing': False,
            'num_clusters': 1,
            'output_dir': 'out',
            'debug': {},
            'search_distances': {'upstream': (-20, 150)},
            'num_iterations': 2000,
            'MEME': meme_settings,
            'Motifs': motif_settings,
        }
        func = motif.MemeScoringFunction(organism, membership, ratio_matrix,
                                         config_params=config_params)
        iteration_result = {'iteration': 100}
        matrix = func.compute(iteration_result)
예제 #37
0
    def setUp(self):  # pylint: disable-msg=C0103
        """Prepare distances, ratio matrix, organism, config and membership."""
        num_clusters = 43
        self.search_distances = {'upstream': (-20, 150)}
        self.scan_distances = {'upstream': (-30, 250)}

        filters = [dm.nochange_filter, dm.center_scale_filter]
        matrix_factory = dm.DataMatrixFactory(filters)
        infile = util.read_dfile('halo_ratios5.tsv',
                                 has_header=True, quote='\"')
        self.ratio_matrix = matrix_factory.create_from(infile)
        self.organism = make_halo(self.ratio_matrix,
                                  self.search_distances,
                                  self.scan_distances)

        clusters_per_col = int(round(num_clusters * 2.0 / 3.0))
        self.config_params = {
            'memb.min_cluster_rows_allowed': 3,
            'memb.max_cluster_rows_allowed': 70,
            'multiprocessing': False,
            'memb.clusters_per_row': 2,
            'memb.clusters_per_col': clusters_per_col,
            'num_clusters': num_clusters,
            'num_iterations': 2000,
        }
        # the membership is read last because it relies on config_params
        self.membership = self.__read_members()
        self.iteration_result = {'iteration': 51}
예제 #38
0
def read_matrix(filename):
    """Load a comma-separated intensity file and return it as ratios.

    The matrix is restricted to the columns returned by read_rug() (called
    with a predicate testing row[1] against RUG_PROPS) and read_controls().
    If SELECT_ROWS is set, only the rows picked by select_probes() are
    kept. The result is whatever intensities_to_ratios() produces.
    """
    controls = read_controls()
    rug = read_rug(lambda row: row[1] in RUG_PROPS)
    wanted_columns = list(set(rug + controls))

    def keep_wanted_columns(matrix):
        """restrict the matrix to the wanted columns"""
        return matrix.submatrix_by_name(column_names=wanted_columns)

    # The column restriction is installed as the factory's first filter,
    # so normalization will be applied to the submatrix.
    factory = dm.DataMatrixFactory([keep_wanted_columns])
    source = util.read_dfile(filename, sep=',', has_header=True, quote="\"")
    result = factory.create_from(source)

    column_groups = {1: range(result.num_columns)}
    if SELECT_ROWS:
        chosen_rows = select_probes(result, 2000, column_groups)
        result = result.submatrix_by_rows(chosen_rows)
    return intensities_to_ratios(result, controls, column_groups)
    def setUp(self):  # pylint: disable-msg=C0103
        """Prepare the shared fixture for the tests of this case.

        Sets the search/scan distances, loads the halo ratio matrix, builds
        the organism through testutil.make_halo(), assembles the full
        configuration dictionary and reads the cluster membership.
        """
        num_clusters = 43

        self.search_distances = {'upstream': (-20, 150)}
        self.scan_distances = {'upstream': (-30, 250)}

        factory = dm.DataMatrixFactory(
            [dm.nochange_filter, dm.center_scale_filter])
        ratio_file = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                                     has_header=True,
                                     quote='\"')
        self.ratio_matrix = factory.create_from(ratio_file)
        self.organism = testutil.make_halo(self.search_distances,
                                           self.scan_distances,
                                           self.ratio_matrix)

        # per-section settings, collected first to keep the main table short
        columns_section = {'schedule': lambda i: True}
        rows_section = {
            'schedule': lambda i: True,
            'scaling': ('scaling_const', 6.0),
        }
        motifs_section = {
            'schedule': lambda i: True,
            'scaling': ('scaling_rvec',
                        'seq(0, 1, length=num_iterations*3/4)'),
        }
        meme_section = {
            'version': '4.3.0',
            'global_background': False,
            'schedule': lambda i: True,
            'nmotifs_rvec':
                'c(rep(1, num_iterations/3), rep(2, num_iterations/3))',
            'max_width': 24,
            'arg_mod': 'zoops',
            'background_order': 3,
            'use_revcomp': 'True',
        }
        networks_section = {
            'schedule': lambda i: True,
            'scaling': ('scaling_rvec',
                        'seq(1e-5, 0.5, length=num_iterations*3/4)'),
        }

        self.config_params = {
            'memb.min_cluster_rows_allowed': 3,
            'memb.max_cluster_rows_allowed': 70,
            'multiprocessing': False,
            'num_cores': None,
            'memb.clusters_per_row': 2,
            'memb.clusters_per_col': int(round(num_clusters * 2.0 / 3.0)),
            'num_clusters': num_clusters,
            'output_dir': 'out',
            'remap_network_nodes': False,
            'use_BSCM': False,
            'num_iterations': 2000,
            'debug': {},
            'search_distances': {'upstream': (-20, 150)},
            'Columns': columns_section,
            'Rows': rows_section,
            'Motifs': motifs_section,
            'MEME': meme_section,
            'Networks': networks_section,
        }

        # the membership reader relies on config_params, so it must run
        # only after the dictionary above has been assigned
        self.membership = self.__read_members()
        self.iteration_result = {'iteration': 51, 'score_means': {}}
예제 #40
0
 def test_get_non_existing(self):
     """the GO taxonomy mapper yields None for a name it does not know"""
     taxonomy_file = util.read_dfile(PROT2TAXID_FILE_PATH,
                                     sep='\t',
                                     has_header=False)
     lookup = org.make_go_taxonomy_mapper(taxonomy_file)
     found = lookup('does not exist')
     self.assertIsNone(found)
예제 #41
0
 def test_get_existing(self):
     """the GO taxonomy mapper returns the id stored for a known name"""
     dfile = util.read_dfile(PROT2TAXID_FILE_PATH, sep='\t',
                             has_header=False)
     mapper = org.make_go_taxonomy_mapper(dfile)
     # assertEqual instead of the deprecated alias assertEquals, which
     # was removed from unittest in Python 3.12
     self.assertEqual('64091', mapper('Halobacterium salinarium'))
예제 #42
0
 def test_get_non_existing_organism(self):
     """the KEGG code mapper yields None for a code it does not know"""
     taxonomy_file = util.read_dfile(TAXONOMY_FILE_PATH,
                                     sep='\t',
                                     has_header=True,
                                     comment='#')
     lookup = org.make_kegg_code_mapper(taxonomy_file)
     found = lookup('nope')
     self.assertIsNone(found)
예제 #43
0
 def test_get_existing_organism(self):
     """the KEGG code mapper returns the organism name for a known code"""
     dfile = util.read_dfile(TAXONOMY_FILE_PATH, sep='\t',
                             has_header=True, comment='#')
     mapper = org.make_kegg_code_mapper(dfile)
     # assertEqual instead of the deprecated alias assertEquals, which
     # was removed from unittest in Python 3.12
     self.assertEqual('Helicobacter pylori 26695', mapper('hpy'))
예제 #44
0
 def __read_colscores_refresult(self):
     """reads the column score reference result as a data matrix"""
     factory = dm.DataMatrixFactory([])  # no filters
     reference_file = util.read_dfile(
         'testdata/column_scores_refresult.tsv',
         has_header=True,
         quote='"')
     return factory.create_from(reference_file, case_sensitive=True)
예제 #45
0
 def __read_ratios(self):
     """reads the row score test ratios as a data matrix"""
     factory = dm.DataMatrixFactory([])  # no filters
     ratio_file = util.read_dfile('testdata/row_scores_testratios.tsv',
                                  has_header=True)
     return factory.create_from(ratio_file, case_sensitive=True)
예제 #46
0
    def make_microbe(self):
        """Assemble and return the organism object this run operates on.

        Steps, in order:
          1. read the KEGG and GO files (the USER_* path is tried before
             the SYSTEM_* path)
          2. pick the RSAT source: local files if 'rsat_dir' is set,
             otherwise the RSAT database at rsat.RSAT_BASE_URL
          3. pick the operon source: the configured 'operon_file' or
             Microbes Online
          4. if networks are enabled, register the STRING and/or operon
             network factories; a missing STRING file is fetched through
             util.get_url_cached() and recorded in self['string_file']
          5. create the organism through an org.MicrobeFactory

        Raises:
            Exception: if neither KEGG file or neither GO file exists, or
                if 'rsat_dir' is set while 'rsat_organism' is not.
        """
        self.__make_dirs_if_needed()

        # --- KEGG file, user location first
        if os.path.exists(USER_KEGG_FILE_PATH):
            kegg_path = USER_KEGG_FILE_PATH
        elif os.path.exists(SYSTEM_KEGG_FILE_PATH):
            kegg_path = SYSTEM_KEGG_FILE_PATH
        else:
            raise Exception('KEGG file not found !!')
        keggfile = util.read_dfile(kegg_path, comment='#')

        # --- GO file, user location first
        if os.path.exists(USER_GO_FILE_PATH):
            go_path = USER_GO_FILE_PATH
        elif os.path.exists(SYSTEM_GO_FILE_PATH):
            go_path = SYSTEM_GO_FILE_PATH
        else:
            raise Exception('GO file not found !!')
        gofile = util.read_dfile(go_path)

        # --- RSAT source
        if not self['rsat_dir']:
            rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, self['cache_dir'])
        else:
            if not self['rsat_organism']:
                raise Exception(
                    'override RSAT loading: please specify --rsat_organism')
            logging.info("using RSAT files for '%s'", self['rsat_organism'])
            rsatdb = rsat.RsatFiles(self['rsat_dir'],
                                    self['rsat_organism'],
                                    self['ncbi_code'])

        # --- operon source
        if not self['operon_file']:
            logging.info(
                "attempting automatic download of operons from Microbes Online"
            )
            mo_db = microbes_online.MicrobesOnline(self['cache_dir'])
        else:
            logging.info("using operon file at '%s'", self['operon_file'])
            mo_db = microbes_online.MicrobesOnlineOperonFile(
                self['operon_file'])

        stringfile = self['string_file']
        kegg_mapper = org.make_kegg_code_mapper(keggfile)
        rsat_mapper = org.make_rsat_organism_mapper(rsatdb)
        ncbi_code = self['ncbi_code']
        nw_factories = []

        # --- STRING network
        if self['donetworks'] and self['use_string']:
            if stringfile is not None:
                logging.info("Loading STRING file at '%s'", stringfile)
            else:
                # no file provided: download it, which needs the NCBI code
                if ncbi_code is None:
                    kegg_organism = kegg_mapper(self['organism_code'])
                    rsat_info = rsat_mapper(kegg_organism,
                                            self['rsat_organism'])
                    ncbi_code = rsat_info.taxonomy_id

                logging.info("NCBI CODE IS: %s", ncbi_code)
                url = STRING_URL_PATTERN % ncbi_code
                stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
                self['string_file'] = stringfile
                logging.info("Automatically using STRING file in '%s'",
                             stringfile)
                util.get_url_cached(url, stringfile)

            string_factory = stringdb.get_network_factory2(
                self['organism_code'], stringfile, 0.5)
            nw_factories.append(string_factory)

        # --- operon network
        if self['donetworks'] and self['use_operons']:
            logging.info('adding operon network factory')
            operon_factory = microbes_online.get_network_factory(
                mo_db,
                max_operon_size=self.ratio_matrix.num_rows / 20,
                weight=0.5)
            nw_factories.append(operon_factory)

        go_mapper = org.make_go_taxonomy_mapper(gofile)
        microbe_factory = org.MicrobeFactory(kegg_mapper, rsat_mapper,
                                             go_mapper, mo_db, nw_factories,
                                             self['ncbi_code'])
        return microbe_factory.create(self['organism_code'],
                                      self['search_distances'],
                                      self['scan_distances'],
                                      self['use_operons'],
                                      self['rsat_organism'],
                                      self.ratio_matrix)
예제 #47
0
    def read_edges2(filename, organism, ratios):
        """Read a preprocessed edge file and return the edges to keep.

        Each line of the file supplies two node names (columns 0 and 1)
        and a score (column 2). Both node names are first passed through
        patches.patch_string_gene(). An edge is kept when both nodes are
        in the organism's thesaurus and, if ratios are given, their
        thesaurus names are also among the names derived from the ratio
        matrix rows. With ratios, kept nodes are translated through a
        lookup table that is seeded from the ratio row names and extended
        with nodes met in the file.

        NOTE: sep, organism_code and normalized are not defined in this
        function; they are expected to come from the enclosing scope.

        Args:
            filename: path of the edge file, read with util.read_dfile()
            organism: object providing thesaurus()
            ratios: ratio matrix providing row_names, or a false value to
                skip the restriction to ratio rows

        Returns:
            a list of (node1, node2, score) tuples; unless `normalized`
            is set, the list is passed through
            normalize_edges_to_max_score() with the highest score seen
        """
        logging.info("stringdb.read_edges2()")
        dfile = util.read_dfile(filename, sep)
        logging.info("Finished loading %s", filename)

        # intern() is a builtin on Python 2 only, Python 3 has sys.intern()
        try:
            intern_str = intern
        except NameError:
            from sys import intern as intern_str

        result = []
        max_score = 0.0
        thesaurus = organism.thesaurus()
        if ratios:
            gene_lut = {}
            for row_name in ratios.row_names:
                if row_name in thesaurus:
                    gene_lut[thesaurus[row_name]] = row_name
                gene_lut[row_name] = row_name  # a node always maps to itself
            # Snapshot of the names known from the ratios: gene_lut grows
            # inside the loop below, and the filter must not depend on the
            # order in which the edges are read. A set also makes the
            # membership test O(1).
            cano_genes = frozenset(gene_lut)
        else:
            gene_lut = None
            cano_genes = None

        lines = dfile.lines
        num_lines = len(lines)
        # Report progress about every 5%. The step has to be an integer of
        # at least 1: with fewer than 20 lines "len / 20" is 0 under
        # integer division (modulo by zero) and a fraction otherwise.
        progress_step = max(1, num_lines // 20)

        num_ignored = 0
        # Big speedup: search thesaurus and cano_genes only once per gene
        keep_bool = {}
        for idx, line in enumerate(lines, 1):
            if idx % progress_step == 0:
                logging.info("Processing network %d%%",
                             round(100 * float(idx) / num_lines))

            node1 = patches.patch_string_gene(organism_code, line[0])
            node2 = patches.patch_string_gene(organism_code, line[1])
            for node in (node1, node2):
                if node not in keep_bool:
                    if cano_genes is not None:
                        keep_bool[node] = (node in thesaurus and
                                           thesaurus[node] in cano_genes)
                    else:
                        keep_bool[node] = node in thesaurus

                    # Add this node to the lut if it is not already there.
                    if gene_lut is not None and node not in gene_lut:
                        gene_lut[node] = node
                        if node in thesaurus:
                            gene_lut[thesaurus[node]] = node

            score = float(line[2])
            max_score = max(score, max_score)

            if keep_bool[node1] and keep_bool[node2]:
                # 2/18/15 SD. Translate nodes into names in ratio rows
                # using gene_lut. This will let the ratios matrix define
                # how the genes are named.
                if gene_lut is None:
                    new_edge = (intern_str(node1), intern_str(node2), score)
                else:
                    new_edge = (intern_str(gene_lut[node1]),
                                intern_str(gene_lut[node2]), score)
                result.append(new_edge)
            else:
                num_ignored += 1

        if not normalized:
            result = normalize_edges_to_max_score(result, max_score)

        logging.info("stringdb.read_edges2(), %d edges read, %d edges ignored",
                     len(result), num_ignored)

        return result
def read_matrix(filename):
    """Load a matrix file with a header row, sorted by row name.

    The file is read through util.read_dfile() with '"' as the quote
    character and turned into a data matrix without any filters.
    """
    source = util.read_dfile(filename, has_header=True, quote='\"')
    factory = dm.DataMatrixFactory([])
    matrix = factory.create_from(source, case_sensitive=True)
    return matrix.sorted_by_row_name()
    def setUp(self):  # pylint: disable-msg=C0103
        """Prepare the shared fixture for the tests of this case.

        Sets the search/scan distances, loads the halo ratio matrix, builds
        the organism through testutil.make_halo(), assembles the full
        configuration dictionary and reads the cluster membership.
        """
        self.search_distances = {'upstream': (-20, 150)}
        self.scan_distances = {'upstream': (-30, 250)}

        filters = [dm.nochange_filter, dm.center_scale_filter]
        factory = dm.DataMatrixFactory(filters)
        ratio_file = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                                     has_header=True, quote='\"')
        self.ratio_matrix = factory.create_from(ratio_file)
        self.organism = testutil.make_halo(
            self.search_distances, self.scan_distances, self.ratio_matrix)

        # general settings first ...
        params = {
            'memb.min_cluster_rows_allowed': 3,
            'memb.max_cluster_rows_allowed': 70,
            'multiprocessing': False,
            'num_cores': None,
            'memb.clusters_per_row': 2,
            'memb.clusters_per_col': int(round(43 * 2.0 / 3.0)),
            'num_clusters': 43,
            'output_dir': 'out',
            'remap_network_nodes': False,
            'use_BSCM': False,
            'num_iterations': 2000,
            'debug': {},
            'search_distances': {'upstream': (-20, 150)},
        }

        # ... then one entry per section
        params['Columns'] = {'schedule': lambda i: True}
        params['Rows'] = {'schedule': lambda i: True,
                          'scaling': ('scaling_const', 6.0)}
        params['Motifs'] = {
            'schedule': lambda i: True,
            'scaling': ('scaling_rvec',
                        'seq(0, 1, length=num_iterations*3/4)'),
        }
        params['MEME'] = {
            'version': '4.3.0',
            'global_background': False,
            'schedule': lambda i: True,
            'nmotifs_rvec':
                'c(rep(1, num_iterations/3), rep(2, num_iterations/3))',
            'max_width': 24,
            'arg_mod': 'zoops',
            'background_order': 3,
            'use_revcomp': 'True',
        }
        params['Networks'] = {
            'schedule': lambda i: True,
            'scaling': ('scaling_rvec',
                        'seq(1e-5, 0.5, length=num_iterations*3/4)'),
        }
        self.config_params = params

        # the membership reader relies on config_params, so it must run
        # only after the dictionary above has been assigned
        self.membership = self.__read_members()
        self.iteration_result = {'iteration': 51, 'score_means': {}}
예제 #50
0
 def test_read_with_tabs_and_header(self):
     """Reads a tab delimited file with a header"""
     dfile = util.read_dfile("testdata/simple.tsv", has_header=True)
     lines = dfile.lines
     # assertEqual instead of the deprecated alias assertEquals, which
     # was removed from unittest in Python 3.12
     self.assertEqual(1, len(lines))
     self.assertEqual(["value11", "value12"], dfile.header)