def make_halo(search_distances, scan_distances, ratios=None):
    """returns the organism object to work on"""
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR, 'Halobacterium sp', 64091)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    stringfile = 'testdata/string_links_64091.tab'

    nw_factories = []
    if stringfile is not None:
        nw_factories.append(stringdb.get_network_factory('hal', stringfile, 0.5))
    else:
        logging.warning("no STRING file specified!")

    if ratios is not None:
        nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=ratios.num_rows // 20, weight=0.5))

    keggorg = util.make_dfile_map(keggfile, 1, 3)['hal']
    rsat_organism = rsatdb.get_rsat_organism(keggorg)
    rsat_info = org.RsatSpeciesInfo(rsatdb, keggorg, rsat_organism, 64091)
    gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
    return org.Microbe('hal', keggorg, rsat_info, gotax, mo_db, nw_factories,
                       search_distances, scan_distances, True, None)
def __get_kegg_data(self):
    # determine the NCBI code
    organism_code = self['organism_code']
    if os.path.exists(USER_KEGG_FILE_PATH):
        keggfile = util.read_dfile(USER_KEGG_FILE_PATH, comment='#')
    elif os.path.exists(SYSTEM_KEGG_FILE_PATH):
        keggfile = util.read_dfile(SYSTEM_KEGG_FILE_PATH, comment='#')
    else:
        raise Exception('KEGG file not found!')

    kegg_map = util.make_dfile_map(keggfile, 1, 3)
    kegg2ncbi = util.make_dfile_map(keggfile, 1, 2)
    if self['ncbi_code'] is None and organism_code in kegg2ncbi:
        self['ncbi_code'] = kegg2ncbi[organism_code]
    return self['ncbi_code'], kegg_map[organism_code]
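# A minimal standalone sketch of the two-column lookup used above -- assuming
# util.make_dfile_map(dfile, key_col, value_col) builds a plain dict from two
# columns of the delimited file (inferred from its usage here; the KEGG row
# below is a hypothetical illustration, not the real file):
def make_map(lines, key_col, value_col):
    """builds {line[key_col]: line[value_col]} over all lines"""
    return {line[key_col]: line[value_col] for line in lines}

kegg_lines = [['entry', 'hal', '64091', 'Halobacterium sp']]
print(make_map(kegg_lines, 1, 3)['hal'])  # -> 'Halobacterium sp'
print(make_map(kegg_lines, 1, 2)['hal'])  # -> '64091'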
def __make_organism(self):
    """makes a mock organism with almost real data"""
    features = {}
    dfile = util.read_dfile('testdata/Halobacterium_sp_features', comment='--')
    for line in dfile.lines:
        features[line[0]] = st.Feature(
            line[0], line[1], line[2],
            st.Location(line[3], int(line[4]), int(line[5]), line[6] == 'R'))
    tfile = util.read_dfile('testdata/Halobacterium_sp_feature_names',
                            comment='--')
    synonyms = th.create_from_rsat_feature_names(tfile)
    return MockOrganismWithSynonyms('64091', features, synonyms)
def read_ratios(params, args_in):
    """reading ratios matrix"""
    if params['normalize_ratios']:
        if test_data_change(params, args_in):
            # Turn off the nochange_filter if you're resuming a run and have
            # changed the data matrix
            ratio_filters = [dm.center_scale_filter]
        else:
            ratio_filters = [dm.nochange_filter, dm.center_scale_filter]
    else:
        ratio_filters = []
    matrix_factory = dm.DataMatrixFactory(ratio_filters)

    matrix_filename = args_in.ratios
    if matrix_filename.startswith('http://'):
        indata = util.read_url(matrix_filename)
        infile = util.dfile_from_text(indata, has_header=True, quote='"')
    else:
        infile = util.read_dfile(matrix_filename, has_header=True, quote='"')

    if params['case_sensitive'] or args_in.case_sensitive:
        ratios = matrix_factory.create_from(infile, True)
    else:
        ratios = matrix_factory.create_from(infile, False)
    return ratios
def setUp(self):  # pylint: disable-msg=C0103
    """test fixture"""
    self.search_distances = {'upstream': (-20, 150)}
    self.scan_distances = {'upstream': (-30, 250)}

    matrix_factory = dm.DataMatrixFactory([dm.nochange_filter,
                                           dm.center_scale_filter])
    infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                             has_header=True, quote='"')
    self.ratio_matrix = matrix_factory.create_from(infile)
    self.organism = testutil.make_halo(self.search_distances,
                                       self.scan_distances, self.ratio_matrix)
    self.config_params = {'memb.min_cluster_rows_allowed': 3,
                          'memb.max_cluster_rows_allowed': 70,
                          'multiprocessing': False,
                          'memb.clusters_per_row': 2,
                          'memb.clusters_per_col': int(round(43 * 2.0 / 3.0)),
                          'num_clusters': 43,
                          'num_iterations': 2000}
    self.membership = self.__read_members()  # relies on config_params
    self.iteration_result = {'iteration': 51}
def test_read_with_tabs(self):
    """Reads a tab delimited file"""
    dfile = util.read_dfile("testdata/simple.tsv")
    lines = dfile.lines
    self.assertEquals(["value11", "value12"], lines[0])
    self.assertEquals(["value21", "value22"], lines[1])
    self.assertIsNone(dfile.header)
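# For reference, a sketch of what testdata/simple.tsv must contain for the
# assertions above (and the has_header test further below) to pass -- two
# tab-delimited rows. This is inferred from the tests, not the actual fixture:
with open('simple.tsv', 'w') as out:
    out.write("value11\tvalue12\n")
    out.write("value21\tvalue22\n")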
def test_read_with_quotes(self):
    """Reads a semicolon delimited file with quotes"""
    dfile = util.read_dfile("testdata/withquotes.ssv", sep=';',
                            has_header=False, comment='#', quote='"')
    lines = dfile.lines
    self.assertEquals(["value11", "value12"], lines[0])
    self.assertEquals(["value21", "value22"], lines[1])
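# Similarly, a hedged sketch of testdata/withquotes.ssv, inferred from the
# parameters above: ';'-separated fields, '"'-quoted values that read_dfile
# strips, and '#' starting comment lines:
with open('withquotes.ssv', 'w') as out:
    out.write('# comment lines like this one are skipped\n')
    out.write('"value11";"value12"\n')
    out.write('"value21";"value22"\n')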
def test_read_with_semicolon_header_and_comments(self):
    """Reads a semicolon delimited file with a header and comments"""
    dfile = util.read_dfile("testdata/withcomments.ssv", sep=';',
                            has_header=True, comment='#')
    lines = dfile.lines
    self.assertEquals(2, len(lines))
    self.assertEquals(["header1", "header2"], dfile.header)
def read_ratios(params, args_in):
    """reading ratios matrix"""
    if params['normalize_ratios']:
        if test_data_change(params, args_in):
            # Turn off the nochange_filter if you're resuming a run and have
            # changed the data matrix
            ratio_filters = [dm.center_scale_filter]
        else:
            ratio_filters = [dm.nochange_filter, dm.center_scale_filter]
    else:
        ratio_filters = []
    matrix_factory = dm.DataMatrixFactory(ratio_filters)

    matrix_filename = args_in.ratios
    if matrix_filename.startswith('http://'):
        indata = util.read_url(matrix_filename).decode('utf-8')
        infile = util.dfile_from_text(indata, has_header=True, quote='"')
    else:
        infile = util.read_dfile(matrix_filename, has_header=True, quote='"')

    if params['case_sensitive'] or args_in.case_sensitive:
        ratios = matrix_factory.create_from(infile, True)
    else:
        ratios = matrix_factory.create_from(infile, False)
    return ratios
def prepare_ensemble_matrix(ratiofile, outdir, n, kmin):
    matrix_factory = DataMatrixFactory([nochange_filter, center_scale_filter])
    if os.path.exists(ratiofile):
        infile = util.read_dfile(ratiofile, has_header=True, quote='"')
        matrix = matrix_factory.create_from(infile)
        split_matrix(matrix, outdir, n, kmin, matrix.num_columns)
def test_motif_scoring(self):
    """tests the motif scoring in integration"""
    search_distances = {"upstream": (-20, 150)}
    scan_distances = {"upstream": (-30, 250)}

    matrix_factory = dm.DataMatrixFactory([dm.nochange_filter,
                                           dm.center_scale_filter])
    infile = util.read_dfile("example_data/hal/halo_ratios5.tsv",
                             has_header=True, quote='"')
    ratio_matrix = matrix_factory.create_from(infile)
    organism = testutil.make_halo(search_distances, scan_distances,
                                  ratio_matrix)
    membership = FakeMembership()
    config_params = {"memb.min_cluster_rows_allowed": 3,
                     "memb.max_cluster_rows_allowed": 70,
                     "multiprocessing": False,
                     "num_clusters": 1,
                     "output_dir": "out",
                     "debug": {},
                     "search_distances": {"upstream": (-20, 150)},
                     "num_iterations": 2000,
                     "MEME": {"schedule": lambda i: True,
                              "version": "4.3.0",
                              "global_background": False,
                              "arg_mod": "zoops",
                              "nmotifs_rvec": "c(rep(1, num_iterations/3), rep(2, num_iterations/3))",
                              "use_revcomp": "True",
                              "max_width": 24,
                              "background_order": 3},
                     "Motifs": {"schedule": lambda i: True,
                                "scaling": ("scaling_const", 1.0)}}
    func = motif.MemeScoringFunction(organism, membership, ratio_matrix,
                                     config_params=config_params)
    iteration_result = {"iteration": 100}
    matrix = func.compute(iteration_result)
def __make_ref_operon_pairs(self):
    """returns reference operon pairs for comparison"""
    reffile = util.read_dfile('testdata/operon_reftable.tsv',
                              has_header=True, quote='"')
    refpairs = []
    for line in reffile.lines:
        refpairs.append((line[1], line[2]))
    return refpairs
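# A sketch of the expected testdata/operon_reftable.tsv layout, inferred from
# the indices above (header row; columns 1 and 2 hold the gene pair). The
# header names and gene identifiers here are assumptions for illustration:
with open('operon_reftable.tsv', 'w') as out:
    out.write("id\tgene_head\tgene_member\n")
    out.write("1\tVNG0001\tVNG0002\n")  # yields the pair ('VNG0001', 'VNG0002')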
def test_read_with_empty_lines(self):
    """Reads a semicolon delimited file containing empty lines"""
    dfile = util.read_dfile("testdata/withemptylines.ssv", sep=';',
                            has_header=True, comment='#', quote='"')
    lines = dfile.lines
    self.assertEquals(["header1", "header2"], dfile.header)
    self.assertEquals(2, len(lines))
    self.assertEquals(["value11", "value12"], lines[0])
    self.assertEquals(["value21", "value22"], lines[1])
def make_microbe(code):
    """assemble organism related information and return it to the caller"""
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR)
    kegg_mapper = org.make_kegg_code_mapper(keggfile)
    rsat_mapper = org.make_rsat_organism_mapper(rsatdb)
    rsat_info = rsat_mapper(kegg_mapper(code))
    microbedb = MicrobeDB(keggfile, rsatdb, rsat_info)
    print("NCBI CODE IS: %s" % rsat_info.taxonomy_id)

    gofile = util.read_dfile(GO_FILE_PATH)
    mo_db = microbes_online.MicrobesOnline()
    search_distances = {'upstream': (-20, 150)}
    scan_distances = {'upstream': (-30, 250)}
    org_factory = org.MicrobeFactory(kegg_mapper, rsat_mapper,
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db, [])
    organism = org_factory.create(code, search_distances, scan_distances)
    return microbedb, organism
def make_microbe(code):
    """assemble organism related information and return it to the caller"""
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR)
    kegg_mapper = org.make_kegg_code_mapper(keggfile)
    # rsat_mapper is required by the MicrobeFactory call below
    rsat_mapper = org.make_rsat_organism_mapper(rsatdb)
    rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_mapper(code), None, None)
    microbedb = MicrobeDB(keggfile, rsatdb, rsat_info)
    print("NCBI CODE IS: %s" % rsat_info.taxonomy_id)

    gofile = util.read_dfile(GO_FILE_PATH)
    mo_db = microbes_online.MicrobesOnline()
    search_distances = {'upstream': (-20, 150)}
    scan_distances = {'upstream': (-30, 250)}
    org_factory = org.MicrobeFactory(kegg_mapper, rsat_mapper,
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db, [])
    organism = org_factory.create(code, search_distances, scan_distances)
    return microbedb, organism
def __get_kegg_data(self):
    # determine the NCBI code
    organism_code = self.config_params['organism_code']
    try:
        kegg_path = resource_filename(Requirement.parse("cmonkey2"),
                                      USER_KEGG_FILE_PATH)
    except DistributionNotFound:
        kegg_path = USER_KEGG_FILE_PATH
    keggfile = util.read_dfile(kegg_path, comment='#')
    kegg_map = util.make_dfile_map(keggfile, 1, 3)
    kegg2ncbi = util.make_dfile_map(keggfile, 1, 2)
    if self.config_params['ncbi_code'] is None and organism_code in kegg2ncbi:
        self.config_params['ncbi_code'] = kegg2ncbi[organism_code]
    return self.config_params['ncbi_code'], kegg_map[organism_code]
def create_from_delimited_file2(dfile, case_sensitive):
    """creates a thesaurus from a delimited file where the format is
    <original>SEPARATOR<alt1>;<alt2>;...
    ..."""
    def fix_case(s):
        return s if case_sensitive else s.upper()

    if isinstance(dfile, str):
        dfile = util.read_dfile(dfile, sep=',', has_header=False)
    result = {}
    for line in dfile.lines:
        original = fix_case(line[0])
        # original should map to itself
        result[original] = original
        for alternative in line[1].split(';'):
            result[fix_case(alternative)] = original
    return result
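# A small usage sketch with a stand-in for the dfile argument (the synonym
# data is hypothetical; real call sites pass a filename or a util dfile):
class FakeDFile:
    def __init__(self, lines):
        self.lines = lines

thesaurus = create_from_delimited_file2(FakeDFile([['VNG0001', 'gene1;g1']]),
                                        case_sensitive=False)
print(thesaurus)  # {'VNG0001': 'VNG0001', 'GENE1': 'VNG0001', 'G1': 'VNG0001'}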
def read_ratios(params, args):
    """reading ratios matrix"""
    if params['normalize_ratios']:
        ratio_filters = [dm.nochange_filter, dm.center_scale_filter]
    else:
        ratio_filters = []
    matrix_factory = dm.DataMatrixFactory(ratio_filters)

    matrix_filename = args.ratios
    if matrix_filename.startswith('http://'):
        indata = util.read_url(matrix_filename)
        infile = util.dfile_from_text(indata, has_header=True, quote='"')
    else:
        infile = util.read_dfile(matrix_filename, has_header=True, quote='"')

    if params['case_sensitive'] or args.case_sensitive:
        ratios = matrix_factory.create_from(infile, True)
    else:
        ratios = matrix_factory.create_from(infile, False)
    return ratios
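# A sketch of the ratios-matrix layout read_ratios expects -- an assumption
# inferred from has_header=True and the DataMatrixFactory usage throughout:
# tab-delimited, a header row of condition names, one gene per row (the gene
# names and values below are made up):
with open('toy_ratios.tsv', 'w') as out:
    out.write("GENE\tcond1\tcond2\n")
    out.write("VNG0001\t0.24\t-0.15\n")
    out.write("VNG0002\t-0.06\t0.33\n")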
def setUp(self):  # pylint: disable-msg=C0103
    """test fixture"""
    self.search_distances = {'upstream': (-20, 150)}
    self.scan_distances = {'upstream': (-30, 250)}

    matrix_factory = dm.DataMatrixFactory([dm.nochange_filter,
                                           dm.center_scale_filter])
    infile = util.read_dfile('example_data/hal/halo_ratios5.tsv',
                             has_header=True, quote='"')
    self.ratio_matrix = matrix_factory.create_from(infile)
    self.organism = testutil.make_halo(self.search_distances,
                                       self.scan_distances, self.ratio_matrix)
    self.config_params = {
        'memb.min_cluster_rows_allowed': 3,
        'memb.max_cluster_rows_allowed': 70,
        'multiprocessing': False,
        'num_cores': None,
        'memb.clusters_per_row': 2,
        'memb.clusters_per_col': int(round(43 * 2.0 / 3.0)),
        'num_clusters': 43,
        'output_dir': 'out',
        'remap_network_nodes': False,
        'use_BSCM': False,
        'num_iterations': 2000,
        'debug': {},
        'search_distances': {'upstream': (-20, 150)},
        'Columns': {'schedule': lambda i: True},
        'Rows': {'schedule': lambda i: True,
                 'scaling': ('scaling_const', 6.0)},
        'Motifs': {'schedule': lambda i: True,
                   'scaling': ('scaling_rvec',
                               'seq(0, 1, length=num_iterations*3/4)')},
        'MEME': {'version': '4.3.0',
                 'global_background': False,
                 'schedule': lambda i: True,
                 'nmotifs_rvec': 'c(rep(1, num_iterations/3), rep(2, num_iterations/3))',
                 'max_width': 24,
                 'arg_mod': 'zoops',
                 'background_order': 3,
                 'use_revcomp': 'True'},
        'Networks': {'schedule': lambda i: True,
                     'scaling': ('scaling_rvec',
                                 'seq(1e-5, 0.5, length=num_iterations*3/4)')}}
    self.membership = self.__read_members()  # relies on config_params
    self.iteration_result = {'iteration': 51, 'score_means': {}}
def read_edges2(filename, organism, ratios, sep='\t', normalized=False):
    """just read a preprocessed file, much faster to debug

    Note: in the original module, sep, normalized and the organism code are
    supplied by the enclosing network factory; they are hedged here as
    parameters/locals so the function is self-contained."""
    logging.info("stringdb.read_edges2()")
    dfile = util.read_dfile(filename, sep)
    logging.info("Finished loading %s", filename)
    result = []
    max_score = 0.0
    thesaurus = organism.thesaurus()
    organism_code = organism.code  # was a free variable in the original

    if ratios:
        gene_lut = {}
        for row_name in ratios.row_names:
            if row_name in thesaurus:
                gene_lut[thesaurus[row_name]] = row_name
            gene_lut[row_name] = row_name  # a node should always map to itself
        cano_genes = gene_lut.keys()
    else:
        gene_lut = None
        cano_genes = None

    num_ignored = 0
    keep_node = {}  # Big speedup: search thesaurus and cano_genes only once per gene
    idx = 1  # used to display progress
    total_nodes = 0
    nodes_not_in_thesaurus = 0
    nodes_not_in_cano_genes = 0

    for line in dfile.lines:
        # this can be slow, display progress every 5%
        # (integer step avoids float modulo and division by zero on tiny files)
        frac = idx % max(1, len(dfile.lines) // 20)
        idx += 1
        if frac == 0:
            logging.info("Processing network %d%%",
                         round(100 * float(idx) / len(dfile.lines)))
        node1 = patches.patch_string_gene(organism_code, line[0])
        node2 = patches.patch_string_gene(organism_code, line[1])

        for node in (node1, node2):
            if node not in keep_node:
                if cano_genes is not None:
                    keep_node[node] = (node in thesaurus
                                       and thesaurus[node] in cano_genes)
                else:
                    keep_node[node] = node in thesaurus
                if not keep_node[node]:
                    if node not in thesaurus:
                        nodes_not_in_thesaurus += 1
                    elif thesaurus[node] not in cano_genes:
                        nodes_not_in_cano_genes += 1
                # add this node to the lut if it is not already there
                if gene_lut is not None and node not in gene_lut:
                    gene_lut[node] = node
                    if node in thesaurus:
                        gene_lut[thesaurus[node]] = node
                total_nodes += 1

        score = float(line[2])
        max_score = max(score, max_score)

        if keep_node[node1] and keep_node[node2]:
            # 2/18/15 SD. Translate nodes into names in ratio rows using
            # gene_lut. This lets the ratios matrix define how genes are named.
            if gene_lut is None:
                new_edge = (node1, node2, score)
            else:
                new_edge = (gene_lut[node1], gene_lut[node2], score)
            # logging.info("Adding edge %s - %s - %f", new_edge[0], new_edge[1], new_edge[2])
            result.append(new_edge)
        else:
            num_ignored += 1

    # warnings
    if nodes_not_in_thesaurus > 0:
        logging.warning('%d (out of %d) nodes not found in synonyms',
                        nodes_not_in_thesaurus, total_nodes)
    if nodes_not_in_cano_genes > 0:
        logging.warning('%d (out of %d) nodes not found in canonical gene names',
                        nodes_not_in_cano_genes, total_nodes)

    if not normalized:
        result = normalize_edges_to_max_score(result, max_score)
    logging.info("stringdb.read_edges2(), %d edges read, %d edges ignored",
                 len(result), num_ignored)
    return result
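# A sketch of the preprocessed STRING links file this reader expects, inferred
# from the line[0]/line[1]/float(line[2]) accesses above: one edge per row as
# two gene identifiers plus a combined score, tab-delimited per the sep
# default assumed here (identifiers and scores are made up):
with open('string_links_toy.tab', 'w') as out:
    out.write("VNG0001\tVNG0002\t865.0\n")
    out.write("VNG0001\tVNG0005\t412.0\n")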
if not args.string and not args.operons:
    args.nonetworks = True

# user overrides in config files
if args.config:
    config.read(args.config)

matrix_factory = dm.DataMatrixFactory([dm.nochange_filter,
                                       dm.center_scale_filter])
matrix_filename = args.ratios

if matrix_filename.startswith('http://'):
    indata = util.read_url(matrix_filename)
    infile = util.dfile_from_text(indata, has_header=True, quote='"')
else:
    infile = util.read_dfile(matrix_filename, has_header=True, quote='"')
matrix = matrix_factory.create_from(infile)
infile = None

# override number of clusters either on the command line or through
# the config file
try:
    num_clusters = config.getint("General", "num_clusters")
except:
    num_clusters = args.numclusters

cmonkey_run = cmr.CMonkeyRun(args.organism, matrix,
                             string_file=args.string,
                             rsat_organism=args.rsat_organism,
                             log_filename=args.logfile,
def __read_ratios(self):
    dfile = util.read_dfile('testdata/row_scores_testratios.tsv',
                            has_header=True)
    return dm.DataMatrixFactory([]).create_from(dfile, case_sensitive=True)
def __read_colscores_refresult(self):
    dfile = util.read_dfile('testdata/column_scores_refresult.tsv',
                            has_header=True, quote='"')
    return dm.DataMatrixFactory([]).create_from(dfile, case_sensitive=True)
def make_organism(self):
    """returns the organism object to work on"""
    self.__make_dirs_if_needed()
    ncbi_code, kegg_species = self.__get_kegg_data()

    if os.path.exists(USER_GO_FILE_PATH):
        gofile = util.read_dfile(USER_GO_FILE_PATH)
    elif os.path.exists(SYSTEM_GO_FILE_PATH):
        gofile = util.read_dfile(SYSTEM_GO_FILE_PATH)
    else:
        raise Exception('GO file not found!')

    if self['rsat_dir']:
        if not self['rsat_organism']:
            raise Exception('override RSAT loading: please specify --rsat_organism')
        logging.info("using RSAT files for '%s'", self['rsat_organism'])
        rsatdb = rsat.RsatFiles(self['rsat_dir'], self['rsat_organism'],
                                ncbi_code, self['rsat_features'],
                                self['rsat_base_url'])
    else:
        rsatdb = rsat.RsatDatabase(self['rsat_base_url'], self['cache_dir'],
                                   kegg_species, ncbi_code,
                                   self['rsat_features'])

    if self['operon_file']:
        logging.info("using operon file at '%s'", self['operon_file'])
        mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])
    else:
        logging.info("attempting automatic download of operons from Microbes Online")
        mo_db = microbes_online.MicrobesOnline(self['cache_dir'])

    stringfile = self['string_file']
    nw_factories = []
    is_microbe = self['organism_code'] not in VERTEBRATES

    # determine the final weights. note: for now, we will just check whether
    # we have 1 or 2 networks
    num_networks = 0
    if not self['nonetworks'] and self['use_string']:
        num_networks += 1
    if is_microbe and not self['nonetworks'] and self['use_operons']:
        num_networks += 1
    network_weight = 0.0
    if num_networks > 0:
        network_weight = 1.0 / num_networks

    # do we use STRING ?
    if not self['nonetworks'] and self['use_string']:
        # download if not provided
        if stringfile is None:
            if ncbi_code is None:
                rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species,
                                                self['rsat_organism'], None)
                ncbi_code = rsat_info.taxonomy_id
            logging.info("NCBI CODE IS: %s", ncbi_code)
            url = STRING_URL_PATTERN % ncbi_code
            stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
            self['string_file'] = stringfile
            logging.info("Automatically using STRING file in '%s' (URL: %s)",
                         stringfile, url)
            util.get_url_cached(url, stringfile)
        else:
            logging.info("Loading STRING file at '%s'", stringfile)

        # create and add network
        nw_factories.append(stringdb.get_network_factory(
            self['organism_code'], stringfile, network_weight))

    # do we use operons ?
    if is_microbe and not self['nonetworks'] and self['use_operons']:
        logging.debug('adding operon network factory')
        nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=self.ratios.num_rows // 20,
            weight=network_weight))

    orgcode = self['organism_code']
    logging.debug("Creating Microbe object for '%s'", orgcode)
    rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species,
                                    self['rsat_organism'], ncbi_code)
    gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]

    synonyms = None
    if self['synonym_file'] is not None:
        synonyms = thesaurus.create_from_delimited_file2(self['synonym_file'],
                                                         self['case_sensitive'])

    # New logic: test to see if there's a fasta file. If not, download it
    # from RSAT, process it, and then return the new file name.
    is_microbe = True
    if is_microbe:
        organism = org.Microbe(orgcode, kegg_species, rsat_info, gotax, mo_db,
                               nw_factories,
                               self['search_distances'],
                               self['scan_distances'],
                               self['use_operons'], self.ratios, synonyms,
                               self['fasta_file'])
    else:
        organism = org.RSATOrganism(orgcode, kegg_species, rsat_info, gotax,
                                    nw_factories,
                                    self['search_distances'],
                                    self['scan_distances'],
                                    self.ratios, synonyms, self['fasta_file'])

    conn = self.__dbconn()
    with conn:
        for network in organism.networks():
            conn.execute("insert into statstypes values ('network',?)",
                         [network.name])
        for sequence_type in self['sequence_types']:
            conn.execute("insert into statstypes values ('seqtype',?)",
                         [sequence_type])
    return organism
def make_organism(self):
    """returns the organism object to work on"""
    self.__make_dirs_if_needed()
    ncbi_code, kegg_species = self.__get_kegg_data()

    try:
        go_file_path = resource_filename(Requirement.parse("cmonkey2"),
                                         USER_GO_FILE_PATH)
    except DistributionNotFound:
        go_file_path = USER_GO_FILE_PATH
    gofile = util.read_dfile(go_file_path)

    if self['rsat_dir']:
        if not self['rsat_organism']:
            raise Exception('override RSAT loading: please specify --rsat_organism')
        logging.info("using RSAT files for '%s'", self['rsat_organism'])
        rsatdb = rsat.RsatFiles(self['rsat_dir'], self['rsat_organism'],
                                ncbi_code, self['rsat_features'],
                                self['rsat_base_url'])
    else:
        rsatdb = rsat.RsatDatabase(self['rsat_base_url'], self['cache_dir'],
                                   kegg_species, ncbi_code,
                                   self['rsat_features'])

    if self['operon_file']:
        logging.info("using operon file at '%s'", self['operon_file'])
        mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])
    else:
        logging.info("attempting automatic download of operons from Microbes Online")
        mo_db = microbes_online.MicrobesOnline(self['cache_dir'])

    stringfile = self['string_file']
    nw_factories = []
    is_microbe = self['organism_code'] not in VERTEBRATES

    # determine the final weights. note: for now, we will just check whether
    # we have 1 or 2 networks
    num_networks = 0
    if not self['nonetworks'] and self['use_string']:
        num_networks += 1
    if is_microbe and not self['nonetworks'] and self['use_operons']:
        num_networks += 1
    network_weight = 0.0
    if num_networks > 0:
        network_weight = 1.0 / num_networks

    # do we use STRING ?
    if not self['nonetworks'] and self['use_string']:
        # download if not provided
        if stringfile is None:
            if ncbi_code is None:
                rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species,
                                                self['rsat_organism'], None)
                ncbi_code = rsat_info.taxonomy_id
            logging.info("NCBI CODE IS: %s", ncbi_code)
            url = STRING_URL_PATTERN % ncbi_code
            stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
            self['string_file'] = stringfile
            logging.info("Automatically using STRING file in '%s' (URL: %s)",
                         stringfile, url)
            util.get_url_cached(url, stringfile)
        else:
            logging.info("Loading STRING file at '%s'", stringfile)

        # create and add network
        nw_factories.append(stringdb.get_network_factory(
            self['organism_code'], stringfile, network_weight))

    # do we use operons ?
    if is_microbe and not self['nonetworks'] and self['use_operons']:
        logging.debug('adding operon network factory')
        nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=self.ratios.num_rows // 20,
            weight=network_weight))

    orgcode = self['organism_code']
    logging.debug("Creating Microbe object for '%s'", orgcode)
    rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species,
                                    self['rsat_organism'], ncbi_code)
    gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]

    synonyms = None
    if self['synonym_file'] is not None:
        synonyms = thesaurus.create_from_delimited_file2(self['synonym_file'],
                                                         self['case_sensitive'])

    # New logic: test to see if there's a fasta file. If not, download it
    # from RSAT, process it, and then return the new file name.
    is_microbe = True
    if is_microbe:
        organism = org.Microbe(orgcode, kegg_species, rsat_info, gotax, mo_db,
                               nw_factories,
                               self['search_distances'],
                               self['scan_distances'],
                               self['use_operons'], self.ratios, synonyms,
                               self['fasta_file'])
    else:
        organism = org.RSATOrganism(orgcode, kegg_species, rsat_info, gotax,
                                    nw_factories,
                                    self['search_distances'],
                                    self['scan_distances'],
                                    self.ratios, synonyms, self['fasta_file'])

    conn = self.__dbconn()
    with conn:
        for network in organism.networks():
            conn.execute("insert into statstypes values ('network',?)",
                         [network.name])
        for sequence_type in self['sequence_types']:
            conn.execute("insert into statstypes values ('seqtype',?)",
                         [sequence_type])
    return organism
def read_matrix(filename):
    """reads a matrix file"""
    infile = util.read_dfile(filename, has_header=True, quote='"')
    return dm.DataMatrixFactory([]).create_from(
        infile, case_sensitive=True).sorted_by_row_name()
def test_read_with_tabs_and_header(self):
    """Reads a tab delimited file with a header"""
    dfile = util.read_dfile("testdata/simple.tsv", has_header=True)
    lines = dfile.lines
    self.assertEquals(1, len(lines))
    self.assertEquals(["value11", "value12"], dfile.header)
        value = ratios.values[row][col]
        outfile.write("%d\t%d\t%f\n" % (gene_id, cond_id, value))

if __name__ == '__main__':
    description = 'addnwportal.py - adding a cMonkey/python run to the database'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--resultdir', required=True,
                        help='cMonkey result directory')
    parser.add_argument('--exptable', help='filename of expression table to generate',
                        default=None)
    args = parser.parse_args()
    resultdb = os.path.join(args.resultdir, 'cmonkey_run.db')
    ratiofile = os.path.join(args.resultdir, 'ratios.tsv.gz')

    # read the matrix
    matrix_factory = dm.DataMatrixFactory([dm.nochange_filter,
                                           dm.center_scale_filter])
    infile = util.read_dfile(ratiofile, has_header=True, quote='"')
    ratios = matrix_factory.create_from(infile)

    # access the run information
    conn = sqlite3.connect(resultdb)
    cursor = conn.cursor()
    cursor.execute('select organism, species, num_iterations, num_clusters from run_infos')
    orgcode, species, num_iterations, num_clusters = cursor.fetchone()
    print("organism: %s species: %s iterations: %d clusters: %d"
          % (orgcode, species, num_iterations, num_clusters))

    # start populating the database
    microbedb, organism = make_microbe(orgcode)
    ncbi_code = microbedb.rsat_info.taxonomy_id
    ucsc_code = UCSC_MAP[orgcode]