Example #1
    def main(self, name, opts):
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        log.debug(opts)

        self.opts = opts
        self.log = log

        # Get performance curve functions from names.
        curve_funs = dict()
        if opts.curves:
            for name in opts.curves:
                curve_funs[name] = get_curve_fun(name)
        anno_curve_funs = dict()
        if opts.anno_curves:
            for name in opts.anno_curves:
                anno_curve_funs[name] = get_curve_fun(name)

        log.info('Loading data ...')
        # Read and sort predictions and outputs.
        output_names = dat.get_output_names(opts.data_file,
                                            regex=opts.output_names,
                                            nb_key=opts.nb_output)
        names = {'chromo': None, 'pos': None,
                 'outputs': output_names,
                 'preds': output_names}
        data = hdf.read(opts.data_file, names, nb_sample=opts.nb_sample)
        data['chromo'] = [chromo.decode() for chromo in data['chromo']]
        data['chromo'] = np.array(data['chromo'])
        data = fold_dict(data, nb_level=1)
        idx = np.lexsort((data['pos'], data['chromo']))
        data = slice_dict(data, idx)
        for chromo in np.unique(data['chromo']):
            chromo_pos = data['pos'][data['chromo'] == chromo]
            tmp = np.sort(chromo_pos)
            assert np.all(chromo_pos == tmp)
        log.info('%d samples' % len(data['pos']))

        reports = []
        curves = []

        log.info('Evaluating globally ...')
        # Evaluate performances globally.
        report = ev.evaluate_outputs(data['outputs'], data['preds'])
        report['anno'] = ANNO_GLOBAL
        reports.append(report)
        pd.set_option('display.width', 1000)
        print(ev.unstack_report(report))

        if curve_funs:
            # Performance curves.
            for name, fun in curve_funs.items():
                log.info('%s curve' % name)
                curve = ev.evaluate_curve(data['outputs'], data['preds'],
                                          fun=fun, nb_point=opts.nb_curve_point)
                if curve is not None:
                    curve['curve'] = name
                    curve['anno'] = ANNO_GLOBAL
                    curves.append(curve)

        if opts.anno_files:
            log.info('Evaluating annotations ...')
            # Evaluate annotations.
            for anno_file in opts.anno_files:
                anno = read_anno_file(anno_file)
                anno_name = os.path.splitext(os.path.basename(anno_file))[0]
                idx = annotate(data['chromo'], data['pos'], anno)
                log.info('%s: %d' % (anno_name, idx.sum()))
                if idx.sum() < opts.anno_min_sites:
                    log.info('Skipping due to insufficient annotated sites!')
                    continue
                # Select data at annotated sites.
                anno_data = slice_dict(data, idx)
                report = ev.evaluate_outputs(anno_data['outputs'],
                                             anno_data['preds'])
                report['anno'] = anno_name
                reports.append(report)

                if anno_curve_funs:
                    # Performance curves on the annotated subset.
                    for name, fun in anno_curve_funs.items():
                        log.info('%s curve' % name)
                        curve = ev.evaluate_curve(
                            anno_data['outputs'], anno_data['preds'],
                            fun=fun, nb_point=opts.nb_curve_point)
                        if curve is not None:
                            curve['curve'] = name
                            curve['anno'] = anno_name
                            curves.append(curve)

        make_dir(opts.out_dir)
        if reports:
            report = pd.concat(reports)
            report = report[['anno', 'metric', 'output', 'value']]
            self.save_report(report, 'metrics')
        if curves:
            curves = pd.concat(curves)
            curves = curves[['anno', 'curve', 'output', 'x', 'y', 'thr']]
            self.save_report(curves, 'curves')

        log.info('Done!')

        return 0
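A minimal sketch of the sorting invariant this example checks after loading: np.lexsort sorts by its last key first, so passing (pos, chromo) orders sites by chromosome and, within each chromosome, by position. The toy arrays below are illustrative and not part of the example.

    import numpy as np

    # Toy genomic coordinates, deliberately out of order.
    chromo = np.array(['2', '1', '1', '2'])
    pos = np.array([50, 30, 10, 20])

    # np.lexsort treats its last key as the primary one:
    # chromosome first, position second.
    idx = np.lexsort((pos, chromo))
    chromo, pos = chromo[idx], pos[idx]

    # The invariant asserted in the example: within every chromosome,
    # positions come out sorted.
    for c in np.unique(chromo):
        chromo_pos = pos[chromo == c]
        assert np.all(chromo_pos == np.sort(chromo_pos))

    print(list(zip(chromo, pos)))  # chromosome 1 first, each sorted by position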
Example #2
    def main(self, name, opts):
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        model = mod.load_model(opts.model_files)

        log.info('Loading data ...')
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        replicate_names = dat.get_replicate_names(opts.data_files[0],
                                                  regex=opts.replicate_names,
                                                  nb_key=opts.nb_replicate)
        data_reader = mod.data_reader_from_model(
            model, replicate_names=replicate_names)

        # Seed used since unobserved input CpG states are randomly sampled
        if opts.seed is not None:
            np.random.seed(opts.seed)
            random.seed(opts.seed)

        data_reader = data_reader(opts.data_files,
                                  nb_sample=nb_sample,
                                  batch_size=opts.batch_size,
                                  loop=False,
                                  shuffle=False)

        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 nb_sample=nb_sample,
                                 batch_size=opts.batch_size,
                                 loop=False,
                                 shuffle=False)

        writer = None
        if opts.out_data:
            writer = H5Writer(opts.out_data, nb_sample)

        log.info('Predicting ...')
        nb_tot = 0
        nb_eval = 0
        data_eval = dict()
        perf_eval = []
        progbar = ProgressBar(nb_sample, log.info)
        for inputs, outputs, weights in data_reader:
            batch_size = len(list(inputs.values())[0])
            nb_tot += batch_size
            progbar.update(batch_size)

            preds = to_list(model.predict(inputs))

            data_batch = dict()
            data_batch['preds'] = dict()
            data_batch['outputs'] = dict()
            for i, name in enumerate(model.output_names):
                data_batch['preds'][name] = preds[i].squeeze()
                data_batch['outputs'][name] = outputs[name].squeeze()

            for name, value in six.iteritems(next(meta_reader)):
                data_batch[name] = value

            if writer:
                writer.write_dict(data_batch)

            nb_eval += batch_size
            dat.add_to_dict(data_batch, data_eval)

            if nb_tot >= nb_sample or \
                    (opts.eval_size and nb_eval >= opts.eval_size):
                data_eval = dat.stack_dict(data_eval)
                perf_eval.append(
                    ev.evaluate_outputs(data_eval['outputs'],
                                        data_eval['preds']))
                data_eval = dict()
                nb_eval = 0

        progbar.close()
        if writer:
            writer.close()

        report = pd.concat(perf_eval)
        report = report.groupby(['metric', 'output']).mean().reset_index()

        if opts.out_report:
            report.to_csv(opts.out_report, sep='\t', index=False)

        report = ev.unstack_report(report)
        print(report.to_string())

        log.info('Done!')

        return 0
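The loop above keeps memory bounded by evaluating in chunks: batches are appended to data_eval until eval_size samples have accumulated (or the data is exhausted), the chunk is stacked and scored, and the accumulator is reset; the per-chunk reports are averaged afterwards. Below is a minimal sketch of that pattern with stand-ins for dat.add_to_dict and dat.stack_dict, assumed here to append batch arrays per key and to concatenate them; the real helpers may differ.

    import numpy as np

    def add_to_dict(batch, acc):
        # Assumed helper behavior: append each batch array to a per-key list.
        for key, value in batch.items():
            acc.setdefault(key, []).append(value)

    def stack_dict(acc):
        # Assumed helper behavior: concatenate the accumulated batches.
        return {key: np.concatenate(values) for key, values in acc.items()}

    nb_sample, batch_size, eval_size = 320, 32, 256
    nb_tot, nb_eval, acc, chunk_scores = 0, 0, dict(), []
    for _ in range(nb_sample // batch_size):  # stand-in for the data reader loop
        batch = {'outputs': np.random.randint(0, 2, batch_size),
                 'preds': np.random.rand(batch_size)}
        add_to_dict(batch, acc)
        nb_tot += batch_size
        nb_eval += batch_size
        if nb_tot >= nb_sample or nb_eval >= eval_size:
            chunk = stack_dict(acc)
            # Score the chunk (mean absolute error as a placeholder metric)
            # and reset the accumulator, as the example resets data_eval
            # and nb_eval.
            chunk_scores.append(np.mean(np.abs(chunk['outputs'] - chunk['preds'])))
            acc, nb_eval = dict(), 0

    # Per-chunk scores are averaged afterwards, as the example does with
    # report.groupby(['metric', 'output']).mean().
    print(np.mean(chunk_scores))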
Example #3
    def main(self, name, opts):
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        model = mod.load_model(opts.model_files)

        log.info('Loading data ...')
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        replicate_names = dat.get_replicate_names(
            opts.data_files[0],
            regex=opts.replicate_names,
            nb_key=opts.nb_replicate)
        data_reader = mod.data_reader_from_model(
            model, replicate_names=replicate_names)

        data_reader = data_reader(opts.data_files,
                                  nb_sample=nb_sample,
                                  batch_size=opts.batch_size,
                                  loop=False, shuffle=False)

        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 nb_sample=nb_sample,
                                 batch_size=opts.batch_size,
                                 loop=False, shuffle=False)

        log.info('Predicting ...')
        data = dict()
        progbar = ProgressBar(nb_sample, log.info)
        for inputs, outputs, weights in data_reader:
            batch_size = len(list(inputs.values())[0])
            progbar.update(batch_size)

            preds = to_list(model.predict(inputs))

            data_batch = dict()
            data_batch['preds'] = dict()
            data_batch['outputs'] = dict()
            for i, name in enumerate(model.output_names):
                data_batch['preds'][name] = preds[i].squeeze()
                data_batch['outputs'][name] = outputs[name].squeeze()

            for name, value in next(meta_reader).items():
                data_batch[name] = value
            dat.add_to_dict(data_batch, data)
        progbar.close()
        data = dat.stack_dict(data)

        report = ev.evaluate_outputs(data['outputs'], data['preds'])

        if opts.out_report:
            report.to_csv(opts.out_report, sep='\t', index=False)

        report = ev.unstack_report(report)
        print(report.to_string())

        if opts.out_data:
            hdf.write_data(data, opts.out_data)

        log.info('Done!')

        return 0
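One detail worth noting in the prediction loops above: Keras model.predict returns a single array for one-output models but a list of arrays for multi-output models. A to_list wrapper of the kind used here (sketched under that assumption; the real helper may differ) normalizes both cases so the indexing loop over model.output_names works either way.

    def to_list(x):
        # Assumed helper behavior: wrap a bare array in a list so single-
        # and multi-output models can be indexed uniformly.
        return x if isinstance(x, list) else [x]

    # Usage, as in the examples:
    # preds = to_list(model.predict(inputs))
    # for i, name in enumerate(model.output_names):
    #     data_batch['preds'][name] = preds[i].squeeze()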