Example #1
def main():
    description = 'Build ML models to predict by-cellline drug response.'
    parser = get_parser(description)
    args = parser.parse_args()

    print('Args:', args, end='\n\n')
    print('Use percent growth for dose levels in log concentration range: [{}, {}]'.format(args.min_logconc, args.max_logconc))
    print()

    cells = NCI60.all_cells() if 'all' in args.cells else args.cells

    for cell in cells:
        print('-' * 10, 'Cell line:', cell, '-' * 10)
        df = NCI60.load_by_cell_data(cell, drug_features=args.drug_features, scaling=args.scaling,
                                     min_logconc=args.min_logconc, max_logconc=args.max_logconc,
                                     subsample=args.subsample, feature_subsample=args.feature_subsample)
        if not df.shape[0]:
            print('No response data found\n')
            continue

        if args.classify:
            good_bins = summarize(df, args.cutoffs, min_count=args.cv)
            if good_bins < 2:
                print('Not enough classes\n')
                continue
        else:
            summarize(df)

        out = os.path.join(args.out_dir, cell)
        for model in args.models:
            if args.classify:
                classify(model, df, cv=args.cv, cutoffs=args.cutoffs, threads=args.threads, prefix=out)
            else:
                regress(model, df, cv=args.cv, cutoffs=args.cutoffs, threads=args.threads, prefix=out)
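Every example on this page imports a project-local argparser module exposing get_parser(). A minimal sketch of such a helper, assuming argparse: the option names come from the example above, but the module layout, defaults, and help strings are guesses.

import argparse

def get_parser(description=None):
    # Hypothetical reconstruction -- only the option names are taken from
    # the example above; defaults and help text are assumptions.
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--cells', nargs='+', default=['all'],
                        help="cell lines to model, or 'all'")
    parser.add_argument('--min_logconc', type=float, default=-5.0,
                        help='lower bound of the log concentration range')
    parser.add_argument('--max_logconc', type=float, default=-4.0,
                        help='upper bound of the log concentration range')
    parser.add_argument('--out_dir', default='.',
                        help='directory for model output files')
    return parser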
Example #2
def main():
    description = 'Build ML models to predict by-drug tumor response.'
    parser = get_parser(description)
    args = parser.parse_args()

    print('Args:', args, end='\n\n')
    if args.use_gi50:
        print('Use NCI GI50 value instead of percent growth')
    else:
        print('Use percent growth at log concentration: {}'.format(args.logconc))

    drugs = args.drugs
    if 'all' in drugs:
        drugs = NCI60.all_drugs()
    elif len(drugs) == 1 and re.match("^[ABC]$", drugs[0].upper()):
        drugs = NCI60.drugs_in_set('Jason:' + drugs[0].upper())
        print("Drugs in set '{}': {}".format(args.drugs[0], len(drugs)))

    print()
    for drug in drugs:
        print('-' * 10, 'Drug NSC:', drug, '-' * 10)
        df = NCI60.load_by_drug_data(drug, cell_features=args.cell_features, scaling=args.scaling,
                                     use_gi50=args.use_gi50, logconc=args.logconc,
                                     subsample=args.subsample, feature_subsample=args.feature_subsample)
        if not df.shape[0]:
            print('No response data found\n')
            continue

        if args.classify:
            cutoffs = None if args.autobins > 1 else args.cutoffs
            good_bins = summarize(df, cutoffs, autobins=args.autobins, min_count=args.cv)
            if good_bins < 2:
                print('Not enough classes\n')
                continue
        else:
            summarize(df)

        out = os.path.join(args.out_dir, 'NSC_' + drug)
        for model in args.models:
            if args.classify:
                classify(model, df, cv=args.cv, cutoffs=args.cutoffs, autobins=args.autobins, threads=args.threads, prefix=out)
            else:
                regress(model, df, cv=args.cv, cutoffs=args.cutoffs, threads=args.threads, prefix=out)
Example #3
                self.label_statistics[label_function]["entities"][type_][
                    entity_string] += 1
                # print("Found |{}| as |{}| using |{}|".format(entity_string, type_, label_function))

        # Log relation statistics
        self.label_statistics[label_function]["relations_total"] += len(
            relations)
        if len(entities) > 1:
            self.label_statistics[label_function]["relation_candidates"] += 1
        if relations:
            for relation in relations:
                self.label_statistics[label_function]["relations"][
                    relation["type"]] += 1


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    supervisor = DistantSupervisor(data_path=args.data_path,
                                   ontology_path=args.ontology_path,
                                   output_path=args.output_path,
                                   timestamp_given=args.timestamp_given,
                                   cos_theta=args.cos_theta,
                                   filter_sentences=args.filter_sentences,
                                   token_pooling=args.token_pooling,
                                   mention_pooling=args.mention_pooling,
                                   entity_fraction=args.entity_fraction)

    supervisor.supervise(label_strategy=args.label_strategy,
                         selection=tuple(args.selection))
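The fragment above increments deeply nested counters such as label_statistics[label_function]["entities"][type_][entity_string], which only works if the structure auto-creates missing keys. A hypothetical initialization using nested defaultdicts and Counters that would support every update in the fragment:

from collections import Counter, defaultdict

def make_label_statistics():
    # Hypothetical sketch: one entry per label function; Counter makes
    # the `+= 1` updates safe for keys that have not been seen before.
    return defaultdict(lambda: {
        'entities': defaultdict(Counter),  # type_ -> entity string -> count
        'relations': Counter(),            # relation type -> count
        'relations_total': 0,
        'relation_candidates': 0,
    })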
Example #4
def train():
    parser = argparser.get_parser()
    args = parser.parse_args()

    device = -1
    if args.gpu >= 0:
        device = args.gpu
    debug = args.debug
    print('input args:\n',
          json.dumps(vars(args), indent=4,
                     separators=(',', ':')))  # pretty print args

    if args.data_name == 'qm9':
        from data import transform_qm9
        transform_fn = transform_qm9.transform_fn
        atomic_num_list = [6, 7, 8, 9, 0]
        mlp_channels = [256, 256]
        gnn_channels = {'gcn': [8, 64], 'hidden': [128, 64]}
        valid_idx = transform_qm9.get_val_ids()
    elif args.data_name == 'zinc250k':
        transform_fn = transform_fn_zinc250k
        atomic_num_list = zinc250_atomic_num_list
        mlp_channels = [1024, 512]
        gnn_channels = {'gcn': [16, 128], 'hidden': [256, 64]}
        valid_idx = transform_zinc250k.get_val_ids()
    else:
        # without this branch, transform_fn and valid_idx would be
        # undefined for any other data_name
        raise ValueError('Unsupported data_name: {}'.format(args.data_name))

    dataset = NumpyTupleDataset.load(
        os.path.join(args.data_dir, args.data_file))
    dataset = TransformDataset(dataset, transform_fn)

    if len(valid_idx) > 0:
        train_idx = [t for t in range(len(dataset)) if t not in valid_idx]
        n_train = len(train_idx)
        train_idx.extend(valid_idx)
        train, test = chainer.datasets.split_dataset(dataset, n_train,
                                                     train_idx)
    else:
        train, test = chainer.datasets.split_dataset_random(
            dataset, int(len(dataset) * 0.8), seed=args.seed)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size)
    num_masks = {
        'node': args.num_node_masks,
        'channel': args.num_channel_masks
    }
    mask_size = {
        'node': args.node_mask_size,
        'channel': args.channel_mask_size
    }
    num_coupling = {
        'node': args.num_node_coupling,
        'channel': args.num_channel_coupling
    }
    model_params = Hyperparameters(
        args.num_atoms,
        args.num_rels,
        len(atomic_num_list),
        num_masks=num_masks,
        mask_size=mask_size,
        num_coupling=num_coupling,
        batch_norm=args.apply_batch_norm,
        additive_transformations=args.additive_transformations,
        learn_dist=args.learn_dist,
        mlp_channels=mlp_channels,
        gnn_channels=gnn_channels)

    model = GraphNvpModel(model_params)

    if device >= 0:
        chainer.cuda.get_device(device).use()
        model.to_gpu(device)

    print('==========================================')
    if device >= 0:
        print('Using GPUs')
    print('Num Minibatch-size: {}'.format(args.batch_size))
    print('Num epoch: {}'.format(args.max_epochs))
    print('==========================================')
    os.makedirs(args.save_dir, exist_ok=True)
    model.save_hyperparams(os.path.join(args.save_dir, 'graphnvp-params.json'))

    opt = chainer.optimizers.Adam()
    opt.setup(model)
    updater = MolNvpUpdater(train_iter, opt, device=device, loss_func=None)
    trainer = training.Trainer(updater, (args.max_epochs, 'epoch'),
                               out=args.save_dir)

    # trainer.extend(extensions.dump_graph('log_likelihood'))

    def print_validity(t):
        adj, x = generate_mols(model, batch_size=100, gpu=device)
        valid_mols = check_validity(adj, x, atomic_num_list,
                                    device)['valid_mols']
        mol_dir = os.path.join(args.save_dir,
                               'generated_{}'.format(t.updater.epoch))
        # mol_dir = os.path.join(args.save_dir, 'generated_{}'.format(t.updater.iteration))
        os.makedirs(mol_dir, exist_ok=True)
        for ind, mol in enumerate(valid_mols):
            save_mol_png(mol, os.path.join(mol_dir, '{}.png'.format(ind)))

    if debug:
        # trainer.extend(print_validity, trigger=(1, 'epoch'))
        trainer.extend(print_validity, trigger=(100, 'iteration'))
    save_epochs = args.save_epochs
    if save_epochs == -1:
        save_epochs = args.max_epochs

    trainer.extend(extensions.snapshot(), trigger=(save_epochs, 'epoch'))
    # trainer.extend(extensions.PlotReport(['log_likelihood'], 'epoch', file_name='qm9.png'),
    #                trigger=(100, 'iteration'))
    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'log_likelihood', 'nll_x', 'nll_adj', 'elapsed_time']))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.ProgressBar())
    if args.load_params == 1:
        chainer.serializers.load_npz(args.load_snapshot, trainer)
    trainer.run()
    chainer.serializers.save_npz(
        os.path.join(args.save_dir, 'graph-nvp-final.npz'), model)
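Once train() finishes, the weights saved as graph-nvp-final.npz can be restored with chainer.serializers.load_npz and sampled with the same helpers the script already uses (generate_mols, check_validity, save_mol_png). A hypothetical sketch, assuming model_params and atomic_num_list as built in train():

import os
import chainer

def sample_final(model_params, atomic_num_list, save_dir, device=-1):
    # Rebuild the model and restore the weights written at the end of train().
    model = GraphNvpModel(model_params)
    chainer.serializers.load_npz(
        os.path.join(save_dir, 'graph-nvp-final.npz'), model)
    if device >= 0:
        chainer.cuda.get_device(device).use()
        model.to_gpu(device)
    # Same generation/validation helpers as print_validity() above.
    adj, x = generate_mols(model, batch_size=100, gpu=device)
    valid_mols = check_validity(adj, x, atomic_num_list, device)['valid_mols']
    for ind, mol in enumerate(valid_mols):
        save_mol_png(mol, os.path.join(save_dir, 'final_{}.png'.format(ind)))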
Example #5
class Settings(CaseClass):
    """
    Manages all settings
    """
    parser = argparser.get_parser()

    def __init__(self, operation=None, options=None, repo=None):
        super(Settings, self).__init__(['operation', 'options', 'repo'])

        # fallback operation
        operation = operation or HelpOperation(self.parser)

        # set parameters
        self.operation = operation
        self.options = options
        self.repo = repo

    def configure_logging(self):
        """
        Setup logging settings
        """
        level = self.options['log_level'] if self.options else logging.INFO
        logging.basicConfig(level=level, format='[%(levelname)s] %(message)s')
        return self

    def parse_args(self, argv):
        """
        Parse command line arguments and update operation and options
        """
        options, args = Settings.parser.parse_args(argv[1:])
        if len(args) < 2:
            return self

        opt_dict = vars(options)
        try:
            operation = op.make(args[0], args[1], args[2:], opt_dict)
        except AssertionError:
            return self

        return Settings(operation, opt_dict, self.repo)

    def load_environ(self, env):
        """
        Load environment variables
        """
        keys = [
            ('access_key', 'AWS_ACCESS_KEY_ID'),
            ('secret_key', 'AWS_SECRET_ACCESS_KEY'),
            ('region', 'AWS_DEFAULT_REGION'),
        ]
        d = {} if self.options is None else self.options.copy()
        updates = dict([(k, env[v]) for k, v in keys if k not in d and v in env])
        if updates:
            d.update(updates)
            return Settings(self.operation, d, self.repo)
        else:
            return self

    def load_config(self):
        """
        Load configuration file and set repo
        """
        if self.options is None:
            return Settings()

        group_id = self.operation.group_id
        if not group_id:
            logging.error('Failed to load config: group id is not set in operation')
            return Settings()

        access_key = self.options['access_key']
        secret_key = self.options['secret_key']
        bucket = self.options['bucket']
        config = self.options['config']
        region = self.options['region']

        if not all([access_key, secret_key, bucket]):
            # command-line arguments take precedence over the configuration file
            path = expandvars(expanduser(config))

            try:
                with open(path) as fp:
                    a, s, b, r = self._read_aws_config(fp, group_id)
                    access_key = a if access_key is None else access_key
                    secret_key = s if secret_key is None else secret_key
                    bucket = b if bucket is None else bucket
                    region = r if region is None else region
            except IOError:
                logging.error('Failed to open configuration file: %s' % config)
                return Settings()

        for x, arg, opt in [
            (access_key, 'access_key', '--access'),
            (secret_key, 'secret_key', '--secret'),
            (bucket, 'bucket', '--bucket'),
        ]:
            if x is None:
                logging.error('Oops! "%s" setting is missing.' % arg)
                logging.error('Use "%s" option or write configuration file: %s' % (opt, config))
                return Settings()

        # set repository driver
        driver = S3Driver(access_key, secret_key, bucket, group_id, region)
        repo = Repository(driver, group_id)
        return Settings(self.operation, self.options, repo)

    @classmethod
    def _read_aws_config(cls, fp, group_id):
        import ConfigParser

        parser = ConfigParser.SafeConfigParser()
        parser.readfp(fp)

        # use group id as section name
        section_name = group_id if parser.has_section(group_id) else 'default'

        def f(attr):
            return parser.get(section_name, attr) if parser.has_option(section_name, attr) else None

        access_key = f('aws_access_key_id')
        secret_key = f('aws_secret_access_key')
        bucket = f('bucket')
        region = f('region')

        return access_key, secret_key, bucket, region
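_read_aws_config expects an INI-style file whose section name is the group id, falling back to "default", with the four option names read above. A minimal usage sketch (Python 2, to match the example's ConfigParser import; all values are placeholders):

from StringIO import StringIO

SAMPLE_CONFIG = """\
[default]
aws_access_key_id = <your-access-key>
aws_secret_access_key = <your-secret-key>
bucket = <your-bucket>
region = us-east-1
"""

# Falls back to the [default] section because no [my-group] section exists.
access_key, secret_key, bucket, region = Settings._read_aws_config(
    StringIO(SAMPLE_CONFIG), 'my-group')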
Example #6
    # output final samples
    output_samples(model, args, eval_generator)


def generate_main(args):
    # determine input image size
    args.max_height, args.max_width = output_size_from_glob(
        args.convert_glob, width=args.max_width)
    print('creating model...')
    model, input_generator, eval_generator = get_model_by_name(args.model)
    model = model(args)
    print('loading weights...')
    weights_filename = args.weights_prefix + '.weights'
    if not args.ignore_weights and os.path.exists(weights_filename):
        model.nodes['texnet'].load_weights(weights_filename)

    transform_glob(model, args)


if __name__ == '__main__':
    import argparser
    import os

    # get_parser() returns the parser; parse_args() produces the namespace
    args = argparser.get_parser().parse_args()

    output_dir = os.path.dirname(args.output_prefix)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    main(args)
Example #7
from __future__ import print_function

# When running remotely (use a non-interactive backend)
import matplotlib
matplotlib.use('Agg')

import os
import sys

from datasets import NCI60
from argparser import get_parser

description = 'Save dataframes to CSV files.'
parser = get_parser(description)

parser.add_argument("--by",
                    default='drug',
                    choices=['cell', 'drug'],
                    help='generate dataframes for by cell or by drug problems')
parser.add_argument("--float_format", default='%.4g', help='csv float format')

# https://docs.python.org/3.6/library/argparse.html#argparse.ArgumentParser.parse_args  # (ap)
args = parser.parse_args()

sys.stdout = sys.stderr
# print('Args:', args, end='\n\n')

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

if args.by == 'cell':