def main():
    description = 'Build ML models to predict by-cell-line drug response.'
    parser = get_parser(description)
    args = parser.parse_args()
    print('Args:', args, end='\n\n')

    print('Use percent growth for dose levels in log concentration range: [{}, {}]'.format(args.min_logconc, args.max_logconc))
    print()

    cells = NCI60.all_cells() if 'all' in args.cells else args.cells

    for cell in cells:
        print('-' * 10, 'Cell line:', cell, '-' * 10)
        df = NCI60.load_by_cell_data(cell, drug_features=args.drug_features, scaling=args.scaling,
                                     min_logconc=args.min_logconc, max_logconc=args.max_logconc,
                                     subsample=args.subsample, feature_subsample=args.feature_subsample)
        if not df.shape[0]:
            print('No response data found\n')
            continue

        if args.classify:
            good_bins = summarize(df, args.cutoffs, min_count=args.cv)
            if good_bins < 2:
                print('Not enough classes\n')
                continue
        else:
            summarize(df)

        out = os.path.join(args.out_dir, cell)

        for model in args.models:
            if args.classify:
                classify(model, df, cv=args.cv, cutoffs=args.cutoffs, threads=args.threads, prefix=out)
            else:
                regress(model, df, cv=args.cv, cutoffs=args.cutoffs, threads=args.threads, prefix=out)
def main():
    description = 'Build ML models to predict by-drug tumor response.'
    parser = get_parser(description)
    args = parser.parse_args()
    print('Args:', args, end='\n\n')

    if args.use_gi50:
        print('Use NCI GI50 value instead of percent growth')
    else:
        print('Use percent growth at log concentration: {}'.format(args.logconc))

    drugs = args.drugs
    if 'all' in drugs:
        drugs = NCI60.all_drugs()
    elif len(drugs) == 1 and re.match("^[ABC]$", drugs[0].upper()):
        drugs = NCI60.drugs_in_set('Jason:' + drugs[0].upper())
        print("Drugs in set '{}': {}".format(args.drugs[0], len(drugs)))
    print()

    for drug in drugs:
        print('-' * 10, 'Drug NSC:', drug, '-' * 10)
        df = NCI60.load_by_drug_data(drug, cell_features=args.cell_features, scaling=args.scaling,
                                     use_gi50=args.use_gi50, logconc=args.logconc,
                                     subsample=args.subsample, feature_subsample=args.feature_subsample)
        if not df.shape[0]:
            print('No response data found\n')
            continue

        if args.classify:
            cutoffs = None if args.autobins > 1 else args.cutoffs
            good_bins = summarize(df, cutoffs, autobins=args.autobins, min_count=args.cv)
            if good_bins < 2:
                print('Not enough classes\n')
                continue
        else:
            summarize(df)

        out = os.path.join(args.out_dir, 'NSC_' + drug)

        for model in args.models:
            if args.classify:
                classify(model, df, cv=args.cv, cutoffs=args.cutoffs, autobins=args.autobins, threads=args.threads, prefix=out)
            else:
                regress(model, df, cv=args.cv, cutoffs=args.cutoffs, threads=args.threads, prefix=out)
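# Hedged usage note (not part of the original script; it only restates how the
# drug-selection branch in main() above behaves, assuming the NCI60 helpers
# work as they are called there):
#   args.drugs == ['all']          -> NCI60.all_drugs()
#   args.drugs == ['b']            -> NCI60.drugs_in_set('Jason:B')   # single letter A/B/C
#   args.drugs == ['740', '3053']  -> used verbatim as NSC identifiers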
                self.label_statistics[label_function]["entities"][type_][entity_string] += 1
                # print("Found |{}| as |{}| using |{}|".format(entity_string, type_, label_function))

        # Log relation statistics
        self.label_statistics[label_function]["relations_total"] += len(relations)
        if len(entities) > 1:
            self.label_statistics[label_function]["relation_candidates"] += 1
        if relations:
            for relation in relations:
                self.label_statistics[label_function]["relations"][relation["type"]] += 1


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    supervisor = DistantSupervisor(data_path=args.data_path,
                                   ontology_path=args.ontology_path,
                                   output_path=args.output_path,
                                   timestamp_given=args.timestamp_given,
                                   cos_theta=args.cos_theta,
                                   filter_sentences=args.filter_sentences,
                                   token_pooling=args.token_pooling,
                                   mention_pooling=args.mention_pooling,
                                   entity_fraction=args.entity_fraction)
    supervisor.supervise(label_strategy=args.label_strategy, selection=tuple(args.selection))
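# Hedged sketch (an assumption; the constructor that builds label_statistics is
# not shown in this excerpt): the counters incremented above imply a
# per-label-function layout roughly like
#
#   self.label_statistics[label_function] = {
#       "entities":            {type_: {entity_string: count}},
#       "relations":           {relation_type: count},
#       "relations_total":     int,
#       "relation_candidates": int,
#   }
#
# e.g. built from nested collections.defaultdict / Counter instances.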
def train():
    parser = argparser.get_parser()
    args = parser.parse_args()

    device = -1
    if args.gpu >= 0:
        device = args.gpu
    debug = args.debug

    print('input args:\n', json.dumps(vars(args), indent=4, separators=(',', ':')))  # pretty print args

    if args.data_name == 'qm9':
        from data import transform_qm9
        transform_fn = transform_qm9.transform_fn
        atomic_num_list = [6, 7, 8, 9, 0]
        mlp_channels = [256, 256]
        gnn_channels = {'gcn': [8, 64], 'hidden': [128, 64]}
        valid_idx = transform_qm9.get_val_ids()
    elif args.data_name == 'zinc250k':
        transform_fn = transform_fn_zinc250k
        atomic_num_list = zinc250_atomic_num_list
        mlp_channels = [1024, 512]
        gnn_channels = {'gcn': [16, 128], 'hidden': [256, 64]}
        valid_idx = transform_zinc250k.get_val_ids()

    dataset = NumpyTupleDataset.load(os.path.join(args.data_dir, args.data_file))
    dataset = TransformDataset(dataset, transform_fn)

    if len(valid_idx) > 0:
        train_idx = [t for t in range(len(dataset)) if t not in valid_idx]
        n_train = len(train_idx)
        train_idx.extend(valid_idx)
        train, test = chainer.datasets.split_dataset(dataset, n_train, train_idx)
    else:
        train, test = chainer.datasets.split_dataset_random(dataset, int(len(dataset) * 0.8), seed=args.seed)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size)

    num_masks = {'node': args.num_node_masks, 'channel': args.num_channel_masks}
    mask_size = {'node': args.node_mask_size, 'channel': args.channel_mask_size}
    num_coupling = {'node': args.num_node_coupling, 'channel': args.num_channel_coupling}
    model_params = Hyperparameters(args.num_atoms, args.num_rels, len(atomic_num_list),
                                   num_masks=num_masks, mask_size=mask_size, num_coupling=num_coupling,
                                   batch_norm=args.apply_batch_norm,
                                   additive_transformations=args.additive_transformations,
                                   learn_dist=args.learn_dist,
                                   mlp_channels=mlp_channels, gnn_channels=gnn_channels)
    model = GraphNvpModel(model_params)

    if device >= 0:
        chainer.cuda.get_device(device).use()
        model.to_gpu(device)

    print('==========================================')
    if device >= 0:
        print('Using GPUs')
    print('Num Minibatch-size: {}'.format(args.batch_size))
    print('Num epoch: {}'.format(args.max_epochs))
    print('==========================================')

    os.makedirs(args.save_dir, exist_ok=True)
    model.save_hyperparams(os.path.join(args.save_dir, 'graphnvp-params.json'))

    opt = chainer.optimizers.Adam()
    opt.setup(model)
    updater = MolNvpUpdater(train_iter, opt, device=device, loss_func=None)
    trainer = training.Trainer(updater, (args.max_epochs, 'epoch'), out=args.save_dir)
    # trainer.extend(extensions.dump_graph('log_likelihood'))

    def print_validity(t):
        adj, x = generate_mols(model, batch_size=100, gpu=device)
        valid_mols = check_validity(adj, x, atomic_num_list, device)['valid_mols']
        mol_dir = os.path.join(args.save_dir, 'generated_{}'.format(t.updater.epoch))
        # mol_dir = os.path.join(args.save_dir, 'generated_{}'.format(t.updater.iteration))
        os.makedirs(mol_dir, exist_ok=True)
        for ind, mol in enumerate(valid_mols):
            save_mol_png(mol, os.path.join(mol_dir, '{}.png'.format(ind)))

    if debug:
        # trainer.extend(print_validity, trigger=(1, 'epoch'))
        trainer.extend(print_validity, trigger=(100, 'iteration'))

    save_epochs = args.save_epochs
    if save_epochs == -1:
        save_epochs = args.max_epochs
    trainer.extend(extensions.snapshot(), trigger=(save_epochs, 'epoch'))
    # trainer.extend(extensions.PlotReport(['log_likelihood'], 'epoch', file_name='qm9.png'),
    #                trigger=(100, 'iteration'))
    trainer.extend(extensions.PrintReport(['epoch', 'log_likelihood', 'nll_x', 'nll_adj', 'elapsed_time']))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.ProgressBar())

    if args.load_params == 1:
        chainer.serializers.load_npz(args.load_snapshot, trainer)

    trainer.run()
    chainer.serializers.save_npz(os.path.join(args.save_dir, 'graph-nvp-final.npz'), model)
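# Minimal post-training sampling sketch. This is an assumption, not part of
# train(): it reuses generate_mols, check_validity and save_mol_png exactly as
# they are called in print_validity above, and assumes the final weights were
# written by the last line of train().
def sample_from_final_model(model, save_dir, atomic_num_list, device=-1, n=100):
    # restore the weights saved at the end of train()
    chainer.serializers.load_npz(os.path.join(save_dir, 'graph-nvp-final.npz'), model)
    # draw molecules from the trained flow and keep only the chemically valid ones
    adj, x = generate_mols(model, batch_size=n, gpu=device)
    valid_mols = check_validity(adj, x, atomic_num_list, device)['valid_mols']
    out_dir = os.path.join(save_dir, 'generated_final')
    os.makedirs(out_dir, exist_ok=True)
    for ind, mol in enumerate(valid_mols):
        save_mol_png(mol, os.path.join(out_dir, '{}.png'.format(ind)))
    return valid_mols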
class Settings(CaseClass):
    """ Manages all settings """

    parser = argparser.get_parser()

    def __init__(self, operation=None, options=None, repo=None):
        super(Settings, self).__init__(['operation', 'options', 'repo'])

        # fallback operation
        operation = operation or HelpOperation(self.parser)

        # set parameters
        self.operation = operation
        self.options = options
        self.repo = repo

    def configure_logging(self):
        """ Setup logging settings """
        level = self.options['log_level'] if self.options else logging.INFO
        logging.basicConfig(level=level, format='[%(levelname)s] %(message)s')
        return self

    def parse_args(self, argv):
        """ Parse command line arguments and update operation and options """
        options, args = Settings.parser.parse_args(argv[1:])
        if len(args) < 2:
            return self

        opt_dict = vars(options)
        try:
            operation = op.make(args[0], args[1], args[2:], opt_dict)
        except AssertionError:
            return self
        return Settings(operation, opt_dict, self.repo)

    def load_environ(self, env):
        """ Load environment variables """
        keys = [
            ('access_key', 'AWS_ACCESS_KEY_ID'),
            ('secret_key', 'AWS_SECRET_ACCESS_KEY'),
            ('region', 'AWS_DEFAULT_REGION'),
        ]
        d = {} if self.options is None else self.options.copy()
        updates = dict([(k, env[v]) for k, v in keys if k not in d and v in env])
        if updates:
            d.update(updates)
            return Settings(self.operation, d, self.repo)
        else:
            return self

    def load_config(self):
        """ Load configuration file and set repo """
        if self.options is None:
            return Settings()

        group_id = self.operation.group_id
        if not group_id:
            logging.error('Failed to load config: group id is not set in operation')
            return Settings()

        access_key = self.options['access_key']
        secret_key = self.options['secret_key']
        bucket = self.options['bucket']
        config = self.options['config']
        region = self.options['region']

        if not all([access_key, secret_key, bucket]):
            # command line arguments take priority over the configuration file
            path = expandvars(expanduser(config))
            try:
                with open(path) as fp:
                    a, s, b, r = self._read_aws_config(fp, group_id)
                    access_key = a if access_key is None else access_key
                    secret_key = s if secret_key is None else secret_key
                    bucket = b if bucket is None else bucket
                    region = r if region is None else region
            except IOError:
                logging.error('Failed to open configuration file: %s' % config)
                return Settings()

        for x, arg, opt in [
            (access_key, 'access_key', '--access'),
            (secret_key, 'secret_key', '--secret'),
            (bucket, 'bucket', '--bucket'),
        ]:
            if x is None:
                logging.error('Oops! "%s" setting is missing.' % arg)
                logging.error('Use "%s" option or write configuration file: %s' % (opt, config))
                return Settings()

        # set repository driver
        driver = S3Driver(access_key, secret_key, bucket, group_id, region)
        repo = Repository(driver, group_id)
        return Settings(self.operation, self.options, repo)

    @classmethod
    def _read_aws_config(cls, fp, group_id):
        import ConfigParser

        parser = ConfigParser.SafeConfigParser()
        parser.readfp(fp)

        # use group id as section name
        section_name = group_id if parser.has_section(group_id) else 'default'

        def f(attr):
            return parser.get(section_name, attr) if parser.has_option(section_name, attr) else None

        access_key = f('aws_access_key_id')
        secret_key = f('aws_secret_access_key')
        bucket = f('bucket')
        region = f('region')
        return access_key, secret_key, bucket, region
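# Hedged usage sketch (an assumption about the intended call order, not taken
# from the original file): every method above returns either self or a fresh
# Settings, so configuration can be built up by chaining, e.g.
#
#   import os, sys
#   settings = (Settings()
#               .parse_args(sys.argv)       # command line
#               .load_environ(os.environ)   # AWS_* environment variables
#               .load_config()              # configuration file + S3 repository
#               .configure_logging())
#   settings.operation, settings.options, settings.repo  # fully resolved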
from __future__ import print_function

# when running remotely
import matplotlib
matplotlib.use('Agg')

import os
import sys

from datasets import NCI60
from argparser import get_parser


description = 'Save dataframes to CSV files.'
parser = get_parser(description)
parser.add_argument("--by", default='drug', choices=['cell', 'drug'],
                    help='generate dataframes for by-cell or by-drug problems')
parser.add_argument("--float_format", default='%.4g',
                    help='csv float format')

# https://docs.python.org/3.6/library/argparse.html#argparse.ArgumentParser.parse_args
# (ap)
args = parser.parse_args()
sys.stdout = sys.stderr
# print('Args:', args, end='\n\n')

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

if args.by == 'cell':
    model_file.write(model_json)

    # output final samples
    output_samples(model, args, eval_generator)


def generate_main(args):
    # determine input image size
    args.max_height, args.max_width = output_size_from_glob(args.convert_glob, width=args.max_width)

    print('creating model...')
    model, input_generator, eval_generator = get_model_by_name(args.model)
    model = model(args)

    print('loading weights...')
    weights_filename = args.weights_prefix + '.weights'
    if not args.ignore_weights and os.path.exists(weights_filename):
        model.nodes['texnet'].load_weights(weights_filename)

    transform_glob(model, args)


if __name__ == '__main__':
    import argparser
    import os

    args = argparser.get_parser()
    output_dir = os.path.dirname(args.output_prefix)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    main(args)