def test_set_kmer_freq_promoter(self):
    """Print one gene's promoter sequence and its leading k-mer frequencies.

    Smoke test against the test database: it builds a ``GeneSequence`` for
    gene id 76 with sequence type ``'m1'`` and k = 7, prints the sequence
    string, then prints the k-mer frequency table. There are no assertions;
    the test only passes or fails on whether these calls raise.
    """
    kmer_size = 7

    # A Configuration is created and given the k-mer size, as in the other
    # setup code; it is not passed to anything else in this test.
    config = Configuration()
    config.set_kmer_size(kmer_size=kmer_size)

    db_conn = Pgsql.Common.connect(settings.conn_string_test)

    # gnid 58737 was the previously used sample gene; 76 is the current one.
    promoter_seq = GeneSequence(
        gnid=76,
        seq_type='m1',
        is_max_seq_len=True,
        conn=db_conn,
        k=kmer_size)
    print(promoter_seq.get_seq_str())

    # NOTE(review): the meaning of sort_type=4 is defined by the k-mer
    # frequency class, not visible here; limit=10 presumably caps the output.
    promoter_seq.get_kmer_freq(k=kmer_size).print(sort_type=4, limit=10)
def _option_matches(values, choices):
    """Return True when *values* is set and shares an element with *choices*.

    *values* is an argparse attribute (falsy when the option was not given);
    *choices* is one of the module-level sets of accepted spellings.
    """
    return bool(values) and bool(set(values) & choices)


def _select_seq_type(args, exp_setting):
    """Map ``args.seq_type`` to the internal sequence-type code and return it.

    Falls back to ``'p'`` (peptide) when the option is missing or matches
    none of the known groups, so the caller never receives ``None``.
    """
    requested = args.seq_type
    if _option_matches(requested, seq_type_pep):
        print('sequence type: amino acid (peptide)')
        return 'p'
    if _option_matches(requested, seq_type_dna):
        print('sequence type: DNA')
        return 'd'
    if _option_matches(requested, seq_type_pmt):
        print('sequence type: Promoter data')
        # set missing gnids in promoter data
        exp_setting.set_missing_gnids_in_promoter()
        return 'm1'
    if _option_matches(requested, seq_type_rda):
        print('sequence type: Reduced Alphabet')
        # Reduced alphabet reuses the peptide code and flips a global switch.
        settings.RA_MODE = True
        return 'p'
    print('sequence type: amino acid (peptide) - Default')
    return 'p'


def _select_gp_type(args):
    """Map ``args.gp_type`` to ``'g'``, ``'p'`` or ``'b'`` (default ``'g'``)."""
    requested = args.gp_type
    if _option_matches(requested, gp_type_g):
        print('gp type: g')
        return 'g'
    if _option_matches(requested, gp_type_p):
        print('gp type: p')
        return 'p'
    if _option_matches(requested, gp_type_b):
        print('gp type: b')
        return 'b'
    print('gp type: g (default)')
    return 'g'


def _add_percentile_features(selected, percentile_range):
    """Add percentile-based features for every selected GE/PA feature group.

    Args:
        selected: set built from ``args.features``.
        percentile_range: ``[start, stop, step]`` integers fed to ``range``.

    Raises:
        ValueError: if a percentile feature group is selected but the range
            is empty or has fewer than three values.
    """
    # (accepted names, label printed, gp_type, is_top) - one row per group.
    # Several groups may be selected at once; each is handled independently.
    groups = (
        (settings.FN_GE_N, 'GE_N', 'g', True),
        (settings.FN_GE_B, 'GE_B', 'g', False),
        (settings.FN_PA_N, 'PA_N', 'p', True),
        (settings.FN_PA_B, 'PA_B', 'p', False),
    )
    chosen = [group for group in groups if selected & group[0]]
    if not chosen:
        return

    if len(percentile_range) <= 0:
        raise ValueError(
            'percentile range is empty. Please set percentile range.')
    if len(percentile_range) < 3:
        # Previously this surfaced as a bare IndexError further down.
        raise ValueError(
            'percentile range needs start,stop,step but got: {}'.format(
                percentile_range))
    start, stop, step = percentile_range[:3]

    for names, label, gp_type, is_top in chosen:
        print(label)
        feature_set_name = next(iter(names))
        for percentile in range(start, stop, step):
            add_feature_by_percentile(
                gp_type=gp_type,
                feature_set_name=feature_set_name,
                percentile=percentile,
                is_top=is_top)


def _build_for_feature_sets(exp_setting, feature_set):
    """Build feature vectors for k = 3..7 and every feature-set id given."""
    fs_set_idx = 0
    for k in range(3, 8):
        exp_setting.set_kmer_size(kmer_size=k)
        exp_setting.set_genes_info(genes_info=None)

        for fsid in feature_set:
            # Version Info
            print('Version:', settings.DEV_VERSION)

            if fsid == 0:
                # set feature info with dummy data for small assigned gene
                # at random
                fs_info = FeatureInfo(
                    fsid=0, fs_name='SM_RND', gp_type='g', class_size=2)

                # Set assigned genes limit: 23 limits per pass, shifted by
                # 23 for each time fsid 0 has already been processed.
                assigned_genes_limit = [
                    int((x + 23 * fs_set_idx) * 10) for x in range(1, 24)
                ]
                exp_setting.set_assigned_genes_limit(assigned_genes_limit)
                fs_set_idx += 1
            else:
                # get feature set info from DB
                # NOTE(review): ``(fsid)`` is a bare value, not a 1-tuple;
                # confirm what Pgsql.Common.select_data expects here.
                res_fs_info = Pgsql.Common.select_data(
                    sqls.get_feature_set, (fsid))
                fs_info = FeatureInfo(
                    fsid=fsid,
                    fs_name=res_fs_info[0][0].strip(),
                    gp_type=res_fs_info[0][1].strip(),
                    class_size=int(res_fs_info[0][2]))
            exp_setting.set_fs_info(fs_info)

            # for test
            print(
                '### MESSAGE ### fsid: {}, fs_name: {}, gp_type: {}, class_size: {}'
                .format(exp_setting.get_fsid(), exp_setting.get_fs_name(),
                        exp_setting.get_gp_type(),
                        exp_setting.get_class_size()))

            exp_setting.set_debug_mode([1, 0])
            build_feature_vector(exp_setting)


def main(argv):
    """Command-line entry point: configure the experiment and run it.

    Parses ``argv[1:]`` with the module-level ``parser``, fills a
    ``Configuration`` (version, cutoffs, DB selection, test mode, zero
    handling, gene load mode, sequence type, gp type, negative class mode),
    optionally registers feature groups / percentile features / GE&PA
    combinations, and finally builds feature vectors and/or runs the
    reduced-gene validation loop, depending on the options given.

    Args:
        argv: full argument vector including the program name at index 0.

    Raises:
        ValueError: for an unknown negative class mode, or when percentile
            features are requested without a usable ``start,stop,step``
            percentile range.
    """
    # global exp_setting
    exp_setting = Configuration()
    debug_mode = list()
    reduced_mode = False
    test_mode = False
    seq_type = None
    gene_prot = None
    feature_set = list()
    percentile_range = list()

    # With no options at all, show the usage text. argparse's help action
    # normally exits the process; the return is a safeguard. This check runs
    # before the real parse so required options cannot pre-empt it.
    if len(argv) <= 1:
        parser.parse_args(['--help'])
        return
    args = parser.parse_args(argv[1:])

    # set version
    exp_setting.set_version(settings.DEV_VERSION)

    # Show Version
    version = exp_setting.get_version()
    print('Version:', version.get_version())

    # Get Cutoffs
    cutoffs = Cutoffs()
    cutoffs.query_cutoffs('95, 0, -5')
    exp_setting.set_cutoffs(cutoffs)
    print('Cutoffs data Initialized.')

    # enable debugging mode
    if _option_matches(args.enable_debug, enable_debug):
        #debug_mode = [1, 100000]
        debug_mode = [1, 1000]
        reduced_mode = True

    # Database selection: production is used only when explicitly requested;
    # a missing or non-affirmative option selects the test database.
    if _option_matches(args.use_real_db, choice_yes):
        print('USING TEST DB: NO (USEING REAL/PRODUCTION DB)')
    else:
        settings.conn_string = settings.conn_string_test
        print('USING TEST DB: YES')

    if _option_matches(args.test_mode, choice_yes):
        exp_setting.set_test_mode(True)
        print('TEST MODE: YES')
    else:
        exp_setting.set_test_mode(False)
        print('TEST MODE: NO')

    # ignore zero values
    if _option_matches(args.ignore_zero, choice_yes):
        exp_setting.set_ignore_null(True)
        print('Ignore zero values: YES')
    else:
        exp_setting.set_ignore_null(False)
        print('Ignore zero values: NO')

    # Gene info loading mode
    if _option_matches(args.gene_load_mode, gene_load_mode_pl):
        exp_setting.set_gene_load_mode(settings.GN_LD_MODE_PL)
        print('Gene loading mode: pre-load')
    elif _option_matches(args.gene_load_mode, gene_load_mode_dl):
        exp_setting.set_gene_load_mode(settings.GN_LD_MODE_DL)
        print('Gene loading mode: dynamic load')

    # sequence type
    seq_type = _select_seq_type(args, exp_setting)
    exp_setting.set_seq_type(seq_type)

    # gp_type
    gp_type = _select_gp_type(args)
    exp_setting.set_gp_type(gp_type)

    # assign feature groups (several may be requested at once)
    if args.feature_group:
        requested_groups = set(args.feature_group)
        if requested_groups & feature_group_gl:
            print('new feature group: gene low expressed')
            Features.gene_low_exp()
        if requested_groups & feature_group_gh:
            print('new feature group: gene high expressed, top 5%')
            Features.gene_high_exp()
        if requested_groups & feature_group_gh10:
            print('new feature group: gene high expressed, top 10%')
            Features.gene_high_exp_t10()
        if requested_groups & feature_group_gt:
            print('new feature group: gene for each tissue, top 10%')
            Features.gene_tissues()

    # feature set
    if args.feature_set:
        feature_set = args.feature_set
        print('feature set: {}'.format(feature_set))

    # set negative class mode
    if args.neg_class_mode:
        neg_class_mode = args.neg_class_mode
        print('NEG_CLASS_MODE:', neg_class_mode)
        known_modes = (settings.NEG_CLASS_MODE_NOT_P,
                       settings.NEG_CLASS_MODE_RND_S,
                       settings.NEG_CLASS_MODE_RND_M)
        if neg_class_mode not in known_modes:
            # Build one string; a comma-separated tuple here would make the
            # exception text a tuple repr.
            raise ValueError(
                'NEG_CLASS_MODE: {} is UNKNOWN.'.format(neg_class_mode))
        exp_setting.set_neg_class_mode(neg_class_mode)

    # set percentile for new feature set ("start,stop,step")
    if args.percentile:
        percentile_range = [int(x) for x in args.percentile.split(',')]
        print('Set percnetile range:', args.percentile)

    # set gp combo configurations: each entry is split on ':'
    feature_set_gp_comb = list()
    if args.multi_gp:
        for conf in args.multi_gp:
            print(conf)
            feature_set_gp_comb.append(conf.split(':'))
        print(feature_set_gp_comb)

    # class assignment for features
    if args.features:
        selected_features = set(args.features)
        _add_percentile_features(selected_features, percentile_range)

        if selected_features & settings.FN_GPCB:
            print('GE&PA Combination data')
            for conf in feature_set_gp_comb:
                add_feature_gp_comb(conf, exp_setting)

    # build feature vector
    if args.feature_vector:
        intervals = [1000]
        if set(args.feature_vector) & feature_vector:
            print('build feature vector')
            #build_feature_vector()
            if reduced_mode:
                # Walk the gene index range in windows of `interval` genes.
                #for i in range(1,58938, interval):
                for interval in intervals:
                    for i in range(1, 39324, interval):
                        #for i in range(16001,39324, interval):
                        debug_mode = [i, interval]
                        exp_setting.set_debug_mode(debug_mode)
                        build_feature_vector(exp_setting)
            else:
                _build_for_feature_sets(exp_setting, feature_set)

    # single step classification
    if _option_matches(args.validation_mode, validation_mode_rg):
        # reduced gene model
        intervals = [1000, 2000, 3000, 4000, 5000]
        print('validation - reduced genes model mode')
        for interval in intervals:
            for i in range(1, 39324, interval):
                #for i in range(16001,39324, interval):
                debug_mode = [i, interval]
                exp_setting.set_debug_mode(debug_mode)
                # NOTE(review): the other call sites pass exp_setting
                # positionally; confirm build_feature_vector still accepts
                # these keyword arguments.
                build_feature_vector(debug_mode=debug_mode,
                                     gene_prot=gene_prot,
                                     seq_type=seq_type)