def __init__(self, ratios, args):
    """Set up a cMonkey run from a ratios matrix and a config dictionary.

    Args:
        ratios: expression ratio matrix; must expose num_rows/num_columns.
        args: configuration dict; keys read here: 'resume', 'out_database',
            'num_clusters', 'memb.clusters_per_row', 'memb.clusters_per_col',
            'use_operons' and 'MEME' (a dict with a 'version' entry).
    """
    self.__membership = None
    self.__organism = None
    self.config_params = args
    self.ratios = ratios

    if args['resume']:
        # resuming a previous run: seed memberships from the result database
        self.row_seeder = memb.make_db_row_seeder(args['out_database'])
        self.column_seeder = memb.make_db_column_seeder(args['out_database'])
    else:
        # fresh run: k-means row seeding, default column seeding
        self.row_seeder = memb.make_kmeans_row_seeder(args['num_clusters'])
        self.column_seeder = microarray.seed_column_members

    self.__conn = None

    # NOTE: removed an unused `today = date.today()` local (dead code).
    logging.info('Input matrix has # rows: %d, # columns: %d',
                 ratios.num_rows, ratios.num_columns)
    logging.info("# clusters/row: %d", args['memb.clusters_per_row'])
    logging.info("# clusters/column: %d", args['memb.clusters_per_col'])
    logging.info("# CLUSTERS: %d", args['num_clusters'])
    logging.info("use operons: %d", args['use_operons'])

    if args['MEME']['version']:
        logging.info('using MEME version %s', args['MEME']['version'])
    else:
        logging.error('MEME not detected - please check')
def __init__(self, ratios, args_in):
    """Set up a cMonkey run, optionally resuming with a replaced data file.

    Args:
        ratios: expression ratio matrix; must expose num_rows/num_columns.
        args_in: configuration dict; keys read here: 'resume',
            'new_data_file', 'out_database', 'num_clusters',
            'memb.clusters_per_row', 'memb.clusters_per_col',
            'use_operons' and 'MEME' (a dict with a 'version' entry).
    """
    self.__membership = None
    self.__organism = None
    self.config_params = args_in
    self.ratios = ratios

    if args_in['resume']:
        self.row_seeder = memb.make_db_row_seeder(args_in['out_database'])
        # NOTE(review): comparison against True kept as-is — if
        # 'new_data_file' can hold a truthy non-boolean (e.g. a path),
        # plain truthiness would change behavior; confirm before relaxing.
        if args_in['new_data_file'] == True:
            # data file has changed: re-seed columns from the new matrix
            self.column_seeder = microarray.seed_column_members
        else:
            self.column_seeder = memb.make_db_column_seeder(args_in['out_database'])
    else:
        # fresh run: k-means row seeding, default column seeding
        self.row_seeder = memb.make_kmeans_row_seeder(args_in['num_clusters'])
        self.column_seeder = microarray.seed_column_members

    self.__conn = None

    # NOTE: removed an unused `today = date.today()` local (dead code).
    logging.info('Input matrix has # rows: %d, # columns: %d',
                 ratios.num_rows, ratios.num_columns)
    logging.info("# clusters/row: %d", args_in['memb.clusters_per_row'])
    logging.info("# clusters/column: %d", args_in['memb.clusters_per_col'])
    logging.info("# CLUSTERS: %d", args_in['num_clusters'])
    logging.info("use operons: %d", args_in['use_operons'])

    if args_in['MEME']['version']:
        logging.info('using MEME version %s', args_in['MEME']['version'])
    else:
        logging.error('MEME not detected - please check')
def __init__(self, organism_code, ratio_matrix, string_file=None, num_clusters=None):
    """Initialize a run with default configuration parameters.

    Args:
        organism_code: short organism identifier used in config and
            checkpoint file names.
        ratio_matrix: expression ratio matrix; sorted by row name here.
        string_file: optional path to a STRING network file.
        num_clusters: cluster count; derived from the matrix size when None.
    """
    logging.basicConfig(format=LOG_FORMAT, datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.DEBUG)
    self.__membership = None
    self.__organism = None
    self.config_params = {}
    self.ratio_matrix = ratio_matrix.sorted_by_row_name()

    # membership update default parameters
    # these come first, since a lot depends on clustering numbers
    self['memb.clusters_per_row'] = 2
    if num_clusters is None:
        num_clusters = int(round(self.ratio_matrix.num_rows *
                                 self['memb.clusters_per_row'] / 20.0))
    self['memb.clusters_per_col'] = int(round(num_clusters / 2.0))
    self['memb.prob_row_change'] = 0.5
    self['memb.prob_col_change'] = 1.0
    self['memb.max_changes_per_row'] = 1
    self['memb.max_changes_per_col'] = 5
    self['organism_code'] = organism_code
    self['num_clusters'] = num_clusters
    logging.info("# CLUSTERS: %d", self['num_clusters'])

    # default membership seeders
    self.row_seeder = memb.make_kmeans_row_seeder(num_clusters)
    self.column_seeder = microarray.seed_column_members

    self['row_scaling'] = 6.0
    self['string_file'] = None  # overridden below with the constructor arg
    self['cache_dir'] = CACHE_DIR
    self['output_dir'] = 'out'
    self['start_iteration'] = 1
    self['num_iterations'] = 2000
    self['multiprocessing'] = True

    # NOTE(review): original comment claimed quantile normalization is off
    # by default in cMonkey-R, yet it is enabled here — confirm intended.
    self['quantile_normalize'] = True

    # used to select sequences and MEME
    self['sequence_types'] = ['upstream']
    self['search_distances'] = {'upstream': (-20, 150)}
    # used for background distribution and MAST
    self['scan_distances'] = {'upstream': (-30, 250)}

    # membership default parameters
    self['memb.min_cluster_rows_allowed'] = 3
    self['memb.max_cluster_rows_allowed'] = 70
    self['string_file'] = string_file
    self['out_database'] = self['output_dir'] + '/cmonkey_run.db'

    today = date.today()
    self.CHECKPOINT_INTERVAL = None
    self.__checkpoint_basename = "cmonkey-checkpoint-%s-%d%d%d" % (
        organism_code, today.year, today.month, today.day)
def __init__(self, organism_code, ratio_matrix, num_clusters=None):
    """Initialize a bare-bones run with default configuration parameters.

    Args:
        organism_code: short organism identifier used in config and
            checkpoint file names.
        ratio_matrix: expression ratio matrix; sorted by row name here.
            NOTE(review): num_rows is invoked as a method here, while
            sibling variants in this file read it as an attribute — confirm
            which matrix API this class actually receives.
        num_clusters: cluster count; derived from the matrix size when None.
    """
    self.__membership = None
    self.__organism = None
    self.config_params = {}
    self.ratio_matrix = ratio_matrix.sorted_by_row_name()

    # membership update default parameters
    # these come first, since a lot depends on clustering numbers
    self['memb.clusters_per_row'] = 2
    if num_clusters is None:
        num_clusters = int(round(self.ratio_matrix.num_rows() *
                                 self['memb.clusters_per_row'] / 20.0))
    self['memb.clusters_per_col'] = int(round(num_clusters * 2.0 / 3.0))
    self['memb.prob_row_change'] = 0.5
    self['memb.prob_col_change'] = 1.0
    self['memb.max_changes_per_row'] = 1
    self['memb.max_changes_per_col'] = 5
    self['organism_code'] = organism_code
    self['num_clusters'] = num_clusters

    # default membership seeders
    self.row_seeder = memb.make_kmeans_row_seeder(num_clusters)
    self.column_seeder = microarray.seed_column_members

    self['row_scaling'] = 6.0
    self['string_file'] = None
    self['cache_dir'] = CACHE_DIR
    self['output_dir'] = 'out'
    self['start_iteration'] = 1
    self['num_iterations'] = 2000
    self['multiprocessing'] = True

    # used to select sequences and MEME
    self['sequence_types'] = ['upstream']
    self['search_distances'] = {'upstream': (-20, 150)}
    # used for background distribution and MAST
    self['scan_distances'] = {'upstream': (-30, 250)}

    # membership default parameters
    self['memb.min_cluster_rows_allowed'] = 3
    self['memb.max_cluster_rows_allowed'] = 70

    today = date.today()
    self.CHECKPOINT_INTERVAL = None
    self.__checkpoint_basename = "cmonkey-checkpoint-%s-%d%d%d" % (
        organism_code, today.year, today.month, today.day)
    # parenthesized form works under both Python 2 and 3
    # (original used the Python-2-only print statement)
    print("inited the bare bones main cMonkey instance")
def __init__(self, organism_code, ratio_matrix, num_clusters=None):
    """Initialize this specialized run on top of the base CMonkeyRun.

    Delegates to cmonkey_run.CMonkeyRun.__init__ and then re-applies its own
    defaults, overriding several of the base values (cluster-size limits,
    sequence types, search/scan distances).

    Args:
        organism_code: short organism identifier used in config and
            checkpoint file names.
        ratio_matrix: expression ratio matrix; sorted by row name here.
        num_clusters: cluster count; derived from the matrix size when None.
    """
    cmonkey_run.CMonkeyRun.__init__(self, organism_code, ratio_matrix,
                                    num_clusters)
    self.__membership = None
    self.__organism = None
    self.config_params = {}
    self.ratio_matrix = ratio_matrix.sorted_by_row_name()

    # membership update default parameters
    # these come first, since a lot depends on clustering numbers
    self['memb.clusters_per_row'] = 2
    if num_clusters is None:
        num_clusters = int(round(self.ratio_matrix.num_rows() *
                                 self['memb.clusters_per_row'] / 20.0))
    self['memb.clusters_per_col'] = int(round(num_clusters * 2.0 / 3.0))
    self['memb.prob_row_change'] = 0.5
    self['memb.prob_col_change'] = 1.0
    self['memb.max_changes_per_row'] = 1
    self['memb.max_changes_per_col'] = 5
    self['organism_code'] = organism_code
    self['num_clusters'] = num_clusters
    logging.info("\x1b[31mMain:\t\x1b[0m# CLUSTERS: %d", self['num_clusters'])

    # default membership seeders
    self.row_seeder = memb.make_kmeans_row_seeder(num_clusters)
    self.column_seeder = microarray.seed_column_members

    self['row_scaling'] = 6.0
    self['string_file'] = None
    self['cache_dir'] = 'cache'
    self['output_dir'] = 'out'
    self['start_iteration'] = 1
    self['num_iterations'] = 2000
    self['multiprocessing'] = True

    # membership parameters: stricter minimum than the base class default
    self['memb.min_cluster_rows_allowed'] = 10
    self['memb.max_cluster_rows_allowed'] = 70
    self['sequence_types'] = ['Promoter', '3pUTR']
    self['search_distances'] = {'Promoter': (-1000, 200), '3pUTR': (0, 500)}
    # used for background distribution and MAST
    self['scan_distances'] = {'Promoter': (-2000, 750), '3pUTR': (0, 750)}
    logging.info("\x1b[31mMain:\t\x1b[0mcM object initialized")

    today = date.today()
    self.CHECKPOINT_INTERVAL = None
    self.__checkpoint_basename = "cmonkey-checkpoint-%s-%d%d%d" % (
        organism_code, today.year, today.month, today.day)
def __init__(self, organism_code, ratio_matrix, string_file=None, num_clusters=None,
             rsat_organism=None, log_filename=None, remap_network_nodes=False,
             ncbi_code=None, operon_file=None, rsat_dir=None):
    """Initialize a run with network/motif configuration and file overrides.

    Args:
        organism_code: short organism identifier stored in the config.
        ratio_matrix: expression ratio matrix (num_rows/num_columns used).
        string_file: optional STRING network file path override.
        num_clusters: cluster count; derived from the matrix size when None.
        rsat_organism: optional RSAT organism name override.
        log_filename: optional log file path for logging.basicConfig.
        remap_network_nodes: whether to remap network node names.
        ncbi_code: optional NCBI taxonomy code.
        operon_file: optional operon file path override.
        rsat_dir: optional local RSAT directory override.
    """
    logging.basicConfig(format=LOG_FORMAT, datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.DEBUG, filename=log_filename)
    self.__membership = None
    self.__organism = None
    self.config_params = {}
    self.ratio_matrix = ratio_matrix

    # membership update default parameters
    # these come first, since a lot depends on clustering numbers
    self['memb.clusters_per_row'] = 2
    if num_clusters is None:
        num_clusters = int(round(self.ratio_matrix.num_rows *
                                 self['memb.clusters_per_row'] / 20.0))
    # wide matrices get fewer clusters per column
    if ratio_matrix.num_columns >= 60:
        self['memb.clusters_per_col'] = int(round(num_clusters / 2.0))
    else:
        self['memb.clusters_per_col'] = int(round(num_clusters * 2.0 / 3.0))
    logging.info("# clusters/row: %d", self['memb.clusters_per_row'])
    logging.info("# clusters/column: %d", self['memb.clusters_per_col'])

    self['organism_code'] = organism_code
    self['num_clusters'] = num_clusters
    self['use_operons'] = True
    self['use_string'] = True
    self['global_background'] = True
    self['ncbi_code'] = ncbi_code
    self['remap_network_nodes'] = remap_network_nodes
    logging.info("# CLUSTERS: %d", self['num_clusters'])
    logging.info("use operons: %d", self['use_operons'])

    # default membership seeders
    self.row_seeder = memb.make_kmeans_row_seeder(num_clusters)
    self.column_seeder = microarray.seed_column_members

    # file overrides
    self['string_file'] = string_file
    self['operon_file'] = operon_file
    self['rsat_organism'] = rsat_organism
    self['rsat_dir'] = rsat_dir

    # which scoring functions should be active
    self['donetworks'] = True
    self['domotifs'] = True

    today = date.today()
    self.__checkpoint_basename = "cmonkey-checkpoint-%d%d%d" % (
        today.year, today.month, today.day)

    self['meme_version'] = meme.check_meme_version()
    if self['meme_version']:
        logging.info('using MEME version %s', self['meme_version'])
    else:
        # NOTE: the original source had this literal split across two
        # physical lines ('please \n check'), which is a syntax error;
        # reassembled into one string.
        logging.error('MEME not detected - please check')

    # load the user's default pipeline definition when one exists
    if os.path.exists(USER_DEFAULT_PIPELINE_PATH):
        with open(USER_DEFAULT_PIPELINE_PATH) as infile:
            self['pipeline'] = json.load(infile)
def __init__(self, organism_code, ratio_matrix, string_file=None, num_clusters=None,
             rsat_organism=None, log_filename=None, remap_network_nodes=False,
             ncbi_code=None, operon_file=None, rsat_dir=None):
    """Configure a run: cluster counts, defaults, file overrides, MEME check.

    Args:
        organism_code: short organism identifier stored in the config.
        ratio_matrix: expression ratio matrix (num_rows/num_columns used).
        string_file: optional STRING network file path override.
        num_clusters: cluster count; derived from the matrix size when None.
        rsat_organism: optional RSAT organism name override.
        log_filename: optional log file path for logging.basicConfig.
        remap_network_nodes: whether to remap network node names.
        ncbi_code: optional NCBI taxonomy code.
        operon_file: optional operon file path override.
        rsat_dir: optional local RSAT directory override.
    """
    logging.basicConfig(format=LOG_FORMAT, datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.DEBUG, filename=log_filename)
    self.__membership = None
    self.__organism = None
    self.config_params = {}
    self.ratio_matrix = ratio_matrix

    # clustering numbers first: most other defaults derive from them
    per_row = 2
    self['memb.clusters_per_row'] = per_row
    if num_clusters is None:
        num_clusters = int(round(self.ratio_matrix.num_rows * per_row / 20.0))
    wide = ratio_matrix.num_columns >= 60
    per_col = num_clusters / 2.0 if wide else num_clusters * 2.0 / 3.0
    self['memb.clusters_per_col'] = int(round(per_col))
    logging.info("# clusters/row: %d", self['memb.clusters_per_row'])
    logging.info("# clusters/column: %d", self['memb.clusters_per_col'])

    self['organism_code'] = organism_code
    self['num_clusters'] = num_clusters
    self['use_operons'] = True
    self['use_string'] = True
    self['global_background'] = True
    self['ncbi_code'] = ncbi_code
    self['remap_network_nodes'] = remap_network_nodes
    logging.info("# CLUSTERS: %d", self['num_clusters'])
    logging.info("use operons: %d", self['use_operons'])

    # default seeders for row and column memberships
    self.row_seeder = memb.make_kmeans_row_seeder(num_clusters)
    self.column_seeder = microarray.seed_column_members

    # per-run file overrides
    self['string_file'] = string_file
    self['operon_file'] = operon_file
    self['rsat_organism'] = rsat_organism
    self['rsat_dir'] = rsat_dir

    # scoring functions active in this run
    self['donetworks'] = True
    self['domotifs'] = True

    stamp = date.today()
    self.__checkpoint_basename = "cmonkey-checkpoint-%d%d%d" % (
        stamp.year, stamp.month, stamp.day)

    detected = meme.check_meme_version()
    self['meme_version'] = detected
    if detected:
        logging.info('using MEME version %s', detected)
    else:
        logging.error('MEME not detected - please check')

    # pick up the user's default pipeline definition when present
    if os.path.exists(USER_DEFAULT_PIPELINE_PATH):
        with open(USER_DEFAULT_PIPELINE_PATH) as infile:
            self['pipeline'] = json.load(infile)