def main(argv=None):
    """Entry point: parse CLI options, validate the ANTIBODY argument, build the
    feature matrix from the alignment, and run cross-validated regression.

    Returns 0 on success; invalid arguments abort via option_parser.error().
    """
    global OPTIONS

    # Bind sys.argv at call time, not def time: a default of `argv=sys.argv`
    # captures whatever sys.argv was when the module was imported.
    if argv is None:
        argv = sys.argv

    # so some option parsing
    option_parser = setup_option_parser()
    (OPTIONS, args) = option_parser.parse_args(argv)

    # do some argument parsing
    if OPTIONS.TEST:
        run_tests()
        return 0

    if OPTIONS.RAND_SEED is not None:
        seed(OPTIONS.RAND_SEED)

    if len(args) != 2:
        option_parser.error('ANTIBODY is a required argument')

    # check to make sure our mode is exclusive, and set the default (AMINO) if none is set
    # (count the truthy mode flags once instead of recomputing the sum)
    num_modes = sum(1 for v in (OPTIONS.AMINO, OPTIONS.DNA, OPTIONS.STANFEL) if v)
    if num_modes > 1:
        option_parser.error('options --amino, --dna, and --stanfel are mutually exclusive')
    elif num_modes == 0:
        OPTIONS.AMINO = True

    # validate the regression method
    cvopts = {}
    if OPTIONS.REGRESSOR_METHOD in regressor_classes:
        cvopts['regressorcls'] = regressor_classes[OPTIONS.REGRESSOR_METHOD]
    else:
        option_parser.error('%s not in the list of available regression methods: \n %s' % (
            OPTIONS.REGRESSOR_METHOD, '\n '.join(regressor_classes.keys())))

    # lar/lasso-style methods take a feature count `m`; others must not get --numfeats
    if search(r'(?:lar|lasso)$', OPTIONS.REGRESSOR_METHOD):
        if OPTIONS.NUM_FEATURES < 0:
            OPTIONS.NUM_FEATURES = _DEFAULT_NUM_FEATURES
        cvopts['m'] = OPTIONS.NUM_FEATURES
    elif OPTIONS.NUM_FEATURES > 0:
        option_parser.error('--numfeats is a useless parameter for regression method `%s\'' % OPTIONS.REGRESSOR_METHOD)

    cvopts['logspace'] = OPTIONS.LOGSPACE

    # validate the antibody argument, currently a hack exists to make PG9/PG16 work
    # TODO: Fix pg9/16 hax
    antibody = args[1].strip()
    valid_antibodies = sorted(OPTIONS.DATA.antibodies, key=lambda x: x.strip())
    if antibody not in valid_antibodies:
        if ' ' + antibody not in valid_antibodies:
            option_parser.error('%s not in the list of permitted antibodies: \n %s' % (
                antibody, '\n '.join([ab.strip() for ab in valid_antibodies])))
        else:
            antibody = ' ' + antibody

    # validate the subtype option
    valid_subtypes = sorted(OPTIONS.DATA.subtypes, key=lambda x: x.strip().upper())
    for subtype in OPTIONS.SUBTYPES:
        if subtype not in valid_subtypes:
            option_parser.error('%s not in the list of permitted subtypes: \n %s' % (
                subtype, '\n '.join([st.strip() for st in valid_subtypes])))

    # --filter fixes the feature count; otherwise fall back to the default
    if len(OPTIONS.FILTER) != 0:
        if OPTIONS.NUM_FEATURES != -1:
            option_parser.error('--filter and --numfeats are incompatible options')
        else:
            OPTIONS.NUM_FEATURES = len(OPTIONS.FILTER)
    else:  # len(OPTIONS.FILTER) == 0
        if OPTIONS.NUM_FEATURES == -1:
            OPTIONS.NUM_FEATURES = _DEFAULT_NUM_FEATURES

    # destroy the parser because optparse docs recommend it
    option_parser.destroy()

    # use the default DNA HXB2 Reference seq if we define --dna but don't give a new default HXB2 Reference seq
    fix_hxb2_fasta()

    # set the util params
    set_util_params(OPTIONS.HXB2_IDS)

    # fetch the alphabet, we'll probably need it later
    alph = Alphabet(mode=Alphabet.STANFEL if OPTIONS.STANFEL else Alphabet.DNA if OPTIONS.DNA else Alphabet.AMINO)

    ab_basename = ''.join((
        antibody,
        '_dna' if OPTIONS.DNA else '_amino',
        '_clonal' if OPTIONS.CLONAL else ''
    ))
    alignment_basename = '_'.join((
        ab_basename,
        OPTIONS.DATA.basename_root,
        __VERSION__
    ))

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal = OPTIONS.DATA.seqrecords(antibody, OPTIONS.CLONAL, OPTIONS.DNA)

    # if clonal isn't supported, fallback to default
    if clonal != OPTIONS.CLONAL:
        ab_basename = ''.join(ab_basename.rsplit('_clonal', 1))
        alignment_basename = ''.join(alignment_basename.rsplit('_clonal', 1))

    sto_filename = alignment_basename + '.sto'

    alignment = generate_alignment(seqrecords, sto_filename, is_refidx, OPTIONS)[0]

    ylabeler = Labeler(
        seqrecord_get_values,
        lambda seq: is_HXB2(seq) or False,  # TODO: again filtration function
    )
    alignment, y, ic50gt = ylabeler(alignment)

    # renamed from `filter` to avoid shadowing the builtin
    column_filter = naivefilter(
        OPTIONS.MAX_CONSERVATION,
        OPTIONS.MIN_CONSERVATION,
        OPTIONS.MAX_GAP_RATIO,
    )
    refidx = alignment_identify_ref(alignment, is_HXB2)
    builder = DataBuilder(
        alignment,
        alph,
        refidx,
        column_filter
    )
    x = builder(alignment, refidx)
    colnames = builder.labels

    crossvalidator = CrossValidator(
        classifier_cls=Regressor,
        folds=OPTIONS.CV_FOLDS,
        classifier_kwargs=cvopts,
        scorer_cls=ContinuousPerfStats,
        scorer_kwargs={}
    )

    results = crossvalidator.crossvalidate(x, y, classifier_kwargs={}, extra=extract_feature_weights)

    ret = cv_results_to_output(results, colnames)

    print(pretty_fmt_results(ret))

    return 0
def main(argv=None):
    """Parse options and arguments, validate them, and run the cross-validated
    regression pipeline for a single antibody.

    Returns 0 on success; bad arguments exit via option_parser.error().
    """
    global OPTIONS

    # Evaluate sys.argv when called rather than when defined; the old default
    # `argv=sys.argv` froze the value at import time.
    if argv is None:
        argv = sys.argv

    # so some option parsing
    option_parser = setup_option_parser()
    (OPTIONS, args) = option_parser.parse_args(argv)

    # do some argument parsing
    if OPTIONS.TEST:
        run_tests()
        return 0

    if OPTIONS.RAND_SEED is not None:
        seed(OPTIONS.RAND_SEED)

    if len(args) != 2:
        option_parser.error('ANTIBODY is a required argument')

    # check to make sure our mode is exclusive, and set the default (AMINO) if none is set
    # compute the flag count a single time
    mode_count = sum(1 for v in (OPTIONS.AMINO, OPTIONS.DNA, OPTIONS.STANFEL) if v)
    if mode_count > 1:
        option_parser.error(
            'options --amino, --dna, and --stanfel are mutually exclusive')
    elif mode_count == 0:
        OPTIONS.AMINO = True

    # validate the regression method
    cvopts = {}
    if OPTIONS.REGRESSOR_METHOD in regressor_classes:
        cvopts['regressorcls'] = regressor_classes[OPTIONS.REGRESSOR_METHOD]
    else:
        option_parser.error(
            '%s not in the list of available regression methods: \n %s' %
            (OPTIONS.REGRESSOR_METHOD, '\n '.join(regressor_classes.keys())))

    # lar/lasso regressors accept a feature-count parameter; others reject --numfeats
    if search(r'(?:lar|lasso)$', OPTIONS.REGRESSOR_METHOD):
        if OPTIONS.NUM_FEATURES < 0:
            OPTIONS.NUM_FEATURES = _DEFAULT_NUM_FEATURES
        cvopts['m'] = OPTIONS.NUM_FEATURES
    elif OPTIONS.NUM_FEATURES > 0:
        option_parser.error(
            '--numfeats is a useless parameter for regression method `%s\'' %
            OPTIONS.REGRESSOR_METHOD)

    cvopts['logspace'] = OPTIONS.LOGSPACE

    # validate the antibody argument, currently a hack exists to make PG9/PG16 work
    # TODO: Fix pg9/16 hax
    antibody = args[1].strip()
    valid_antibodies = sorted(OPTIONS.DATA.antibodies, key=lambda x: x.strip())
    if antibody not in valid_antibodies:
        if ' ' + antibody not in valid_antibodies:
            option_parser.error(
                '%s not in the list of permitted antibodies: \n %s' %
                (antibody, '\n '.join([ab.strip() for ab in valid_antibodies])))
        else:
            antibody = ' ' + antibody

    # validate the subtype option
    valid_subtypes = sorted(OPTIONS.DATA.subtypes, key=lambda x: x.strip().upper())
    for subtype in OPTIONS.SUBTYPES:
        if subtype not in valid_subtypes:
            option_parser.error(
                '%s not in the list of permitted subtypes: \n %s' %
                (subtype, '\n '.join([st.strip() for st in valid_subtypes])))

    # --filter pins the feature count; otherwise apply the default when unset
    if len(OPTIONS.FILTER) != 0:
        if OPTIONS.NUM_FEATURES != -1:
            option_parser.error(
                '--filter and --numfeats are incompatible options')
        else:
            OPTIONS.NUM_FEATURES = len(OPTIONS.FILTER)
    else:  # len(OPTIONS.FILTER) == 0
        if OPTIONS.NUM_FEATURES == -1:
            OPTIONS.NUM_FEATURES = _DEFAULT_NUM_FEATURES

    # destroy the parser because optparse docs recommend it
    option_parser.destroy()

    # use the default DNA HXB2 Reference seq if we define --dna but don't give a new default HXB2 Reference seq
    fix_hxb2_fasta()

    # set the util params
    set_util_params(OPTIONS.HXB2_IDS)

    # fetch the alphabet, we'll probably need it later
    alph = Alphabet(mode=Alphabet.STANFEL if OPTIONS.STANFEL
                    else Alphabet.DNA if OPTIONS.DNA
                    else Alphabet.AMINO)

    ab_basename = ''.join((antibody,
                           '_dna' if OPTIONS.DNA else '_amino',
                           '_clonal' if OPTIONS.CLONAL else ''))
    alignment_basename = '_'.join(
        (ab_basename, OPTIONS.DATA.basename_root, __VERSION__))

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal = OPTIONS.DATA.seqrecords(antibody, OPTIONS.CLONAL, OPTIONS.DNA)

    # if clonal isn't supported, fallback to default
    if clonal != OPTIONS.CLONAL:
        ab_basename = ''.join(ab_basename.rsplit('_clonal', 1))
        alignment_basename = ''.join(alignment_basename.rsplit('_clonal', 1))

    sto_filename = alignment_basename + '.sto'

    alignment = generate_alignment(seqrecords, sto_filename, is_refidx, OPTIONS)[0]

    ylabeler = Labeler(
        seqrecord_get_values,
        lambda seq: is_HXB2(seq) or False,  # TODO: again filtration function
    )
    alignment, y, ic50gt = ylabeler(alignment)

    # renamed from `filter` so the builtin is not shadowed
    col_filter = naivefilter(
        OPTIONS.MAX_CONSERVATION,
        OPTIONS.MIN_CONSERVATION,
        OPTIONS.MAX_GAP_RATIO,
    )
    refidx = alignment_identify_ref(alignment, is_HXB2)
    builder = DataBuilder(alignment, alph, refidx, col_filter)
    x = builder(alignment, refidx)
    colnames = builder.labels

    crossvalidator = CrossValidator(classifier_cls=Regressor,
                                    folds=OPTIONS.CV_FOLDS,
                                    classifier_kwargs=cvopts,
                                    scorer_cls=ContinuousPerfStats,
                                    scorer_kwargs={})

    results = crossvalidator.crossvalidate(
        x, y, classifier_kwargs={}, extra=extract_feature_weights)

    ret = cv_results_to_output(results, colnames)

    print(pretty_fmt_results(ret))

    return 0
def run_tests():
    """Self-test: build the feature matrix from a known Stockholm alignment and
    compare generated feature names/values against hard-coded expected data.

    Raises AssertionError (with a diagnostic message) on any mismatch; prints
    'ALL TESTS PASS' to stderr on success.
    """
    # set these to this so we don't exclude anything (just testing file generation and parsing)
    OPTIONS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    OPTIONS.MAXREL = False
    OPTIONS.DNA = False
    OPTIONS.MAX_CONSERVATION = 1.0
    OPTIONS.MAX_GAP_RATIO = 1.0
    OPTIONS.MIN_CONSERVATION = 1.0

    # if we don't do this, DOOMBUNNIES
    set_util_params(OPTIONS.HXB2_IDS)

    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # `with` guarantees the handle is closed even if the write raises
        with open(sto_filename, 'w') as fh:
            print(_TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        for OPTIONS.STANFEL in (True, False):
            if OPTIONS.STANFEL:
                OPTIONS.AMINO = False
                _TEST_NAMES = _TEST_STANFEL_NAMES
                _TEST_X = _TEST_STANFEL_X
            else:
                OPTIONS.AMINO = True
                _TEST_NAMES = _TEST_AMINO_NAMES
                _TEST_X = _TEST_AMINO_X

            alph = Alphabet(Alphabet.STANFEL if OPTIONS.STANFEL else Alphabet.DNA if OPTIONS.DNA else Alphabet.AMINO)

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda seq: is_HXB2(seq) or False,  # TODO: again filtration function
            )
            # NOTE(review): `alignment` is rebound here, so the second loop
            # iteration labels the already-labeled alignment — confirm intended
            alignment, y, ic50gt = ylabeler(alignment)

            # renamed from `filter` to avoid shadowing the builtin
            column_filter = naivefilter(
                OPTIONS.MAX_CONSERVATION,
                OPTIONS.MIN_CONSERVATION,
                OPTIONS.MAX_GAP_RATIO
            )
            refidx = alignment_identify_ref(alignment, is_HXB2)
            builder = DataBuilder(
                alignment,
                alph,
                refidx,
                column_filter
            )
            x = builder(alignment, refidx)
            colnames = builder.labels

            # test the feature names portion
            # (explicit raises instead of assert so checks survive `python -O`)
            if len(colnames) != len(_TEST_NAMES):
                raise AssertionError('gen: %s\ntruth: %s' % (colnames, _TEST_NAMES))

            for name in _TEST_NAMES:
                if name not in colnames:
                    raise AssertionError('ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames)))

            if not np.all(_TEST_X == x):
                raise AssertionError('feature matrix mismatch')
            if not np.all(_TEST_Y == y):
                raise AssertionError('label vector mismatch')

            # TODO: generate and test the regressor data generation
            # print y, "\n", x
    finally:
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)
def run_tests():
    """Smoke test the feature-generation pipeline against a bundled Stockholm
    alignment with known expected names and values.

    On mismatch raises AssertionError with details; on success prints
    'ALL TESTS PASS' to stderr.
    """
    # set these to this so we don't exclude anything (just testing file generation and parsing)
    OPTIONS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    OPTIONS.MAXREL = False
    OPTIONS.DNA = False
    OPTIONS.MAX_CONSERVATION = 1.0
    OPTIONS.MAX_GAP_RATIO = 1.0
    OPTIONS.MIN_CONSERVATION = 1.0

    # if we don't do this, DOOMBUNNIES
    set_util_params(OPTIONS.HXB2_IDS)

    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # context manager closes the file even when the write fails
        with open(sto_filename, 'w') as fh:
            print(_TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        for OPTIONS.STANFEL in (True, False):
            if OPTIONS.STANFEL:
                OPTIONS.AMINO = False
                _TEST_NAMES = _TEST_STANFEL_NAMES
                _TEST_X = _TEST_STANFEL_X
            else:
                OPTIONS.AMINO = True
                _TEST_NAMES = _TEST_AMINO_NAMES
                _TEST_X = _TEST_AMINO_X

            alph = Alphabet(Alphabet.STANFEL if OPTIONS.STANFEL
                            else Alphabet.DNA if OPTIONS.DNA
                            else Alphabet.AMINO)

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda seq: is_HXB2(seq) or False,  # TODO: again filtration function
            )
            # NOTE(review): rebinding `alignment` means iteration two re-labels
            # the labeled alignment — verify this is intentional
            alignment, y, ic50gt = ylabeler(alignment)

            # `col_filter`, not `filter`: do not shadow the builtin
            col_filter = naivefilter(OPTIONS.MAX_CONSERVATION,
                                     OPTIONS.MIN_CONSERVATION,
                                     OPTIONS.MAX_GAP_RATIO)
            refidx = alignment_identify_ref(alignment, is_HXB2)
            builder = DataBuilder(alignment, alph, refidx, col_filter)
            x = builder(alignment, refidx)
            colnames = builder.labels

            # test the feature names portion
            # explicit raises keep the checks active under `python -O`
            if len(colnames) != len(_TEST_NAMES):
                raise AssertionError('gen: %s\ntruth: %s' % (colnames, _TEST_NAMES))

            for name in _TEST_NAMES:
                if name not in colnames:
                    raise AssertionError('ERROR: \'%s\' not found in %s' %
                                         (name, ', '.join(colnames)))

            if not np.all(_TEST_X == x):
                raise AssertionError('feature matrix mismatch')
            if not np.all(_TEST_Y == y):
                raise AssertionError('label vector mismatch')

            # TODO: generate and test the regressor data generation
            # print y, "\n", x
    finally:
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)