def main(self, name, opts):
    """Compute and print summary statistics for each model output.

    Reads every output matching ``opts.output_names`` from the first data
    file, computes its statistics, prints them as a table, and optionally
    writes the table to a TSV file and/or a figure.

    Returns 0 on success.
    """
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    output_names = dat.get_output_names(opts.data_files[0],
                                        regex=opts.output_names)
    stats = OrderedDict()
    # BUGFIX(style): the original loop variable was `name`, shadowing the
    # `name` argument used for the logger above.
    for output_name in output_names:
        output = hdf.read(opts.data_files, 'outputs/%s' % output_name,
                          nb_sample=opts.nb_sample)
        output = list(output.values())[0]
        stats[output_name] = get_output_stats(output)

    # One row per output, concatenated into a single table.
    frames = [pd.DataFrame(value, index=[key])
              for key, value in stats.items()]
    stats = pd.concat(frames)
    stats.index.name = 'output'
    stats.reset_index(inplace=True)

    print(stats.to_string())
    if opts.out_tsv:
        stats.to_csv(opts.out_tsv, sep='\t', index=False)
    if opts.out_fig:
        plot_stats(stats).savefig(opts.out_fig)

    return 0
def main(self, name, opts):
    """Print per-output summary statistics read from HDF5 data files.

    Optionally writes the statistics to ``opts.out_tsv`` and plots them
    to ``opts.out_fig``. Returns 0 on success.
    """
    logging.basicConfig(
        filename=opts.log_file,
        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    log.setLevel(logging.DEBUG if opts.verbose else logging.INFO)
    log.debug(opts)

    output_names = dat.get_output_names(opts.data_files[0],
                                        regex=opts.output_names)

    stats = OrderedDict()
    for name in output_names:
        read = hdf.read(opts.data_files, 'outputs/%s' % name,
                        nb_sample=opts.nb_sample)
        values = list(read.values())[0]
        stats[name] = get_output_stats(values)

    # Stack the per-output statistics into one DataFrame.
    rows = []
    for key, value in six.iteritems(stats):
        rows.append(pd.DataFrame(value, index=[key]))
    stats = pd.concat(rows)
    stats.index.name = 'output'
    stats.reset_index(inplace=True)

    print(stats.to_string())
    if opts.out_tsv:
        stats.to_csv(opts.out_tsv, sep='\t', index=False)
    if opts.out_fig:
        plot_stats(stats).savefig(opts.out_fig)
    return 0
def build_model(self):
    """Build the model specified by the command-line options.

    Builds a DNA model, a CpG model, a joint model of both, or loads an
    existing model from disk, then appends output layers for the
    requested outputs unless the loaded model already provides exactly
    those outputs.

    Returns the resulting Keras model.
    """
    opts = self.opts
    log = self.log

    output_names = dat.get_output_names(opts.train_files[0],
                                        regex=opts.output_names,
                                        nb_key=opts.nb_output)
    if not output_names:
        raise ValueError('No outputs found!')

    dna_model = None
    if opts.dna_model:
        dna_model = self.build_dna_model()
    cpg_model = None
    if opts.cpg_model:
        cpg_model = self.build_cpg_model()

    if dna_model is not None and cpg_model is not None:
        log.info('Joining models ...')
        joint_model_builder = mod.joint.get(opts.joint_model)(
            l1_decay=opts.l1_decay,
            l2_decay=opts.l2_decay,
            dropout=opts.dropout)
        stem = joint_model_builder([dna_model, cpg_model])
        stem.name = '_'.join([stem.name, dna_model.name, cpg_model.name])
    elif dna_model is not None:
        stem = dna_model
    elif cpg_model is not None:
        stem = cpg_model
    else:
        log.info('Loading existing model ...')
        stem = mod.load_model(opts.model_files, log=log.info)
        if sorted(output_names) == sorted(stem.output_names):
            # The loaded model already predicts exactly these outputs.
            return stem
        log.info('Removing existing output layers ...')
        remove_outputs(stem)

    # BUGFIX(consistency): pass the first stem output (not the output
    # list) and use the Keras 2 `inputs`/`outputs` keywords, matching the
    # other `build_model` implementation in this file.
    outputs = mod.add_output_layers(stem.outputs[0], output_names)
    model = Model(inputs=stem.inputs, outputs=outputs, name=stem.name)
    return model
def build_model(self):
    """Construct the Keras model defined by the command-line options.

    Depending on the options, a DNA model, a CpG model, a joint model of
    both, or an existing model loaded from disk serves as the stem; new
    output layers are appended unless the stem already provides exactly
    the requested outputs. Returns the Keras model.
    """
    opts = self.opts
    log = self.log

    output_names = dat.get_output_names(opts.train_files[0],
                                        regex=opts.output_names,
                                        nb_key=opts.nb_output)
    if not output_names:
        raise ValueError('No outputs found!')

    dna_model = self.build_dna_model() if opts.dna_model else None
    cpg_model = self.build_cpg_model() if opts.cpg_model else None

    if dna_model is not None and cpg_model is not None:
        # Combine both sub-models via the selected joint architecture.
        log.info('Joining models ...')
        build_joint = mod.joint.get(opts.joint_model)(
            l1_decay=opts.l1_decay,
            l2_decay=opts.l2_decay,
            dropout=opts.dropout)
        stem = build_joint([dna_model, cpg_model])
        stem.name = '_'.join([stem.name, dna_model.name, cpg_model.name])
    elif dna_model is not None:
        stem = dna_model
    elif cpg_model is not None:
        stem = cpg_model
    else:
        # Neither sub-model requested: continue from a stored model.
        log.info('Loading existing model ...')
        stem = mod.load_model(opts.model_files, log=log.info)
        if sorted(output_names) == sorted(stem.output_names):
            # Stored model already has exactly the requested outputs.
            return stem
        log.info('Removing existing output layers ...')
        remove_outputs(stem)

    outputs = mod.add_output_layers(stem.outputs[0], output_names)
    return Model(inputs=stem.inputs, outputs=outputs, name=stem.name)
def main(self, name, opts):
    """Export model outputs and predictions per output to files.

    Reads positions, outputs, and predictions from the data file,
    optionally restricts them to selected chromosomes, and writes one
    bedGraph or HDF5 file per output into the output directory. Known
    labels take precedence over predictions in the exported value.

    Returns 0 on success.
    """
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)

    data_file = h5.File(opts.data_file, 'r')
    nb_sample = len(data_file['pos'])
    if opts.nb_sample:
        nb_sample = min(nb_sample, opts.nb_sample)

    data = dict()
    # BUGFIX(style): loop variables no longer shadow the `name` argument.
    for key in ['chromo', 'pos']:
        data[key] = data_file[key][:nb_sample]

    idx = None
    if opts.chromos:
        # Restrict positions to the selected chromosomes.
        idx = np.in1d(data['chromo'],
                      [chromo.encode() for chromo in opts.chromos])
        for key, value in six.iteritems(data):
            data[key] = value[idx]

    output_names = dat.get_output_names(opts.data_file,
                                        regex=opts.output_names)
    make_dir(opts.out_dir)
    for output_name in output_names:
        log.info(output_name)
        data['output'] = data_file['outputs'][output_name][:nb_sample]
        data['pred'] = data_file['preds'][output_name][:nb_sample]
        if idx is not None:
            for key in ['output', 'pred']:
                data[key] = data[key][idx]

        # Use `output` label if known, otherwise prediction.
        # NOTE(review): `value` aliases the `pred` array, so labeled sites
        # also overwrite `pred` in place — preserved from the original.
        data['value'] = data['pred']
        tmp = data['output'] != dat.CPG_NAN
        data['value'][tmp] = data['output'][tmp]

        # Derive the file name from the output name.
        parts = output_name.split(dat.OUTPUT_SEP)
        if parts[0] == 'cpg':
            file_name = parts[-1]
        else:
            file_name = '_'.join(parts)
        out_file = os.path.join(opts.out_dir, file_name)

        if opts.out_format == 'bedGraph':
            write_to_bedGraph(data, out_file + '.bedGraph.gz',
                              compression='gzip')
        elif opts.out_format == 'hdf':
            write_to_hdf(data, out_file + '.h5')
        else:
            # BUGFIX: the original built this message into `tmp` but then
            # raised a bare ValueError(), discarding it.
            raise ValueError('Invalid output format "%s"!' % opts.out_format)

    # BUGFIX: close the HDF5 handle (previously leaked).
    data_file.close()
    log.info('Done!')
    return 0
def main(self, name, opts):
    """Evaluate prediction performance globally and per annotation.

    Reads outputs and predictions from an HDF5 file, sorts them by
    chromosome and position, computes performance metrics (and optional
    performance curves) over all sites as well as over sites covered by
    each annotation file, and saves the resulting reports.

    Returns 0 on success.
    """
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    self.opts = opts
    self.log = log

    # Get performance curve functions from names.
    curve_funs = dict()
    if opts.curves:
        for name in opts.curves:
            curve_funs[name] = get_curve_fun(name)
    anno_curve_funs = dict()
    if opts.anno_curves:
        for name in opts.anno_curves:
            anno_curve_funs[name] = get_curve_fun(name)

    log.info('Loading data ...')
    # Read and sort predictions and outputs.
    output_names = dat.get_output_names(opts.data_file,
                                        regex=opts.output_names,
                                        nb_key=opts.nb_output)
    names = {'chromo': None, 'pos': None,
             'outputs': output_names,
             'preds': output_names}
    data = hdf.read(opts.data_file, names, nb_sample=opts.nb_sample)
    data['chromo'] = [chromo.decode() for chromo in data['chromo']]
    data['chromo'] = np.array(data['chromo'])
    data = fold_dict(data, nb_level=1)

    idx = np.lexsort((data['pos'], data['chromo']))
    data = slice_dict(data, idx)
    # Sanity check: positions must be sorted within each chromosome.
    for chromo in np.unique(data['chromo']):
        chromo_pos = data['pos'][data['chromo'] == chromo]
        tmp = np.sort(chromo_pos)
        assert np.all(chromo_pos == tmp)
    log.info('%d samples' % len(data['pos']))

    reports = []
    curves = []

    log.info('Evaluating globally ...')
    # Evaluate performances globally.
    report = ev.evaluate_outputs(data['outputs'], data['preds'])
    report['anno'] = ANNO_GLOBAL
    reports.append(report)
    pd.set_option('display.width', 1000)
    print(ev.unstack_report(report))

    if curve_funs:
        # Performance curves.
        for name, fun in curve_funs.items():
            log.info('%s curve' % name)
            curve = ev.evaluate_curve(data['outputs'], data['preds'],
                                      fun=fun,
                                      nb_point=opts.nb_curve_point)
            if curve is not None:
                curve['curve'] = name
                curve['anno'] = ANNO_GLOBAL
                curves.append(curve)

    if opts.anno_files:
        log.info('Evaluating annotations ...')
        # Evaluate annotations.
        for anno_file in opts.anno_files:
            anno = read_anno_file(anno_file)
            anno_name = os.path.splitext(os.path.basename(anno_file))[0]
            idx = annotate(data['chromo'], data['pos'], anno)
            log.info('%s: %d' % (anno_name, idx.sum()))
            if idx.sum() < opts.anno_min_sites:
                log.info('Skipping due to insufficient annotated sites!')
                continue
            # Select data at annotated sites.
            anno_data = slice_dict(data, idx)
            report = ev.evaluate_outputs(anno_data['outputs'],
                                         anno_data['preds'])
            report['anno'] = anno_name
            reports.append(report)

            # BUGFIX: guard on `anno_curve_funs` (the dict iterated below;
            # the original tested `curve_funs`) and evaluate curves on the
            # annotated subset `anno_data` (the original used the global
            # `data`, mislabeling global curves with `anno_name`).
            if anno_curve_funs:
                # Performance curves.
                for name, fun in anno_curve_funs.items():
                    log.info('%s curve' % name)
                    curve = ev.evaluate_curve(anno_data['outputs'],
                                              anno_data['preds'],
                                              fun=fun,
                                              nb_point=opts.nb_curve_point)
                    if curve is not None:
                        curve['curve'] = name
                        curve['anno'] = anno_name
                        curves.append(curve)

    make_dir(opts.out_dir)
    if reports:
        report = pd.concat(reports)
        report = report[['anno', 'metric', 'output', 'value']]
        self.save_report(report, 'metrics')
    if curves:
        curves = pd.concat(curves)
        curves = curves[['anno', 'curve', 'output', 'x', 'y', 'thr']]
        self.save_report(curves, 'curves')
    log.info('Done!')

    return 0
def main(self, name, opts):
    """Export model outputs and predictions per output to files.

    Reads positions, outputs, and predictions from the data file,
    optionally restricts them to selected chromosomes, and writes one
    bedGraph or HDF5 file per output into the output directory. Known
    labels take precedence over predictions in the exported value.

    Returns 0 on success.
    """
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)

    data_file = h5.File(opts.data_file, 'r')
    nb_sample = len(data_file['pos'])
    if opts.nb_sample:
        nb_sample = min(nb_sample, opts.nb_sample)

    data = dict()
    # BUGFIX(style): loop variables no longer shadow the `name` argument.
    for key in ['chromo', 'pos']:
        data[key] = data_file[key][:nb_sample]

    idx = None
    if opts.chromos:
        # Restrict positions to the selected chromosomes.
        idx = np.in1d(data['chromo'],
                      [chromo.encode() for chromo in opts.chromos])
        for key, value in six.iteritems(data):
            data[key] = value[idx]

    output_names = dat.get_output_names(opts.data_file,
                                        regex=opts.output_names)
    make_dir(opts.out_dir)
    for output_name in output_names:
        log.info(output_name)
        data['output'] = data_file['outputs'][output_name][:nb_sample]
        data['pred'] = data_file['preds'][output_name][:nb_sample]
        if idx is not None:
            for key in ['output', 'pred']:
                data[key] = data[key][idx]

        # Use `output` label if known, otherwise prediction.
        # NOTE(review): `value` aliases the `pred` array, so labeled sites
        # also overwrite `pred` in place — preserved from the original.
        data['value'] = data['pred']
        tmp = data['output'] != dat.CPG_NAN
        data['value'][tmp] = data['output'][tmp]

        # Derive the file name from the output name.
        parts = output_name.split(dat.OUTPUT_SEP)
        if parts[0] == 'cpg':
            file_name = parts[-1]
        else:
            file_name = '_'.join(parts)
        out_file = os.path.join(opts.out_dir, file_name)

        if opts.out_format == 'bedGraph':
            write_to_bedGraph(data, out_file + '.bedGraph.gz',
                              compression='gzip')
        elif opts.out_format == 'hdf':
            write_to_hdf(data, out_file + '.h5')
        else:
            # BUGFIX: the original built this message into `tmp` but then
            # raised a bare ValueError(), discarding it.
            raise ValueError('Invalid output format "%s"!' % opts.out_format)

    # BUGFIX: close the HDF5 handle (previously leaked).
    data_file.close()
    log.info('Done!')
    return 0
def main(self, name, opts):
    """Evaluate prediction performance globally and per annotation.

    Reads outputs and predictions from an HDF5 file, sorts them by
    chromosome and position, computes performance metrics (and optional
    performance curves) over all sites as well as over sites covered by
    each annotation file, and saves the resulting reports.

    Returns 0 on success.
    """
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    self.opts = opts
    self.log = log

    # Get performance curve functions from names.
    curve_funs = dict()
    if opts.curves:
        for name in opts.curves:
            curve_funs[name] = get_curve_fun(name)
    anno_curve_funs = dict()
    if opts.anno_curves:
        for name in opts.anno_curves:
            anno_curve_funs[name] = get_curve_fun(name)

    log.info('Loading data ...')
    # Read and sort predictions and outputs.
    output_names = dat.get_output_names(opts.data_file,
                                        regex=opts.output_names,
                                        nb_key=opts.nb_output)
    names = {'chromo': None, 'pos': None,
             'outputs': output_names,
             'preds': output_names}
    data = hdf.read(opts.data_file, names, nb_sample=opts.nb_sample)
    data['chromo'] = [chromo.decode() for chromo in data['chromo']]
    data['chromo'] = np.array(data['chromo'])
    data = fold_dict(data, nb_level=1)

    idx = np.lexsort((data['pos'], data['chromo']))
    data = slice_dict(data, idx)
    # Sanity check: positions must be sorted within each chromosome.
    for chromo in np.unique(data['chromo']):
        chromo_pos = data['pos'][data['chromo'] == chromo]
        tmp = np.sort(chromo_pos)
        assert np.all(chromo_pos == tmp)
    log.info('%d samples' % len(data['pos']))

    reports = []
    curves = []

    log.info('Evaluating globally ...')
    # Evaluate performances globally.
    report = ev.evaluate_outputs(data['outputs'], data['preds'])
    report['anno'] = ANNO_GLOBAL
    reports.append(report)
    pd.set_option('display.width', 1000)
    print(ev.unstack_report(report))

    if curve_funs:
        # Performance curves.
        for name, fun in curve_funs.items():
            log.info('%s curve' % name)
            curve = ev.evaluate_curve(data['outputs'], data['preds'],
                                      fun=fun,
                                      nb_point=opts.nb_curve_point)
            if curve is not None:
                curve['curve'] = name
                curve['anno'] = ANNO_GLOBAL
                curves.append(curve)

    if opts.anno_files:
        log.info('Evaluating annotations ...')
        # Evaluate annotations.
        for anno_file in opts.anno_files:
            anno = read_anno_file(anno_file)
            anno_name = os.path.splitext(os.path.basename(anno_file))[0]
            idx = annotate(data['chromo'], data['pos'], anno)
            log.info('%s: %d' % (anno_name, idx.sum()))
            if idx.sum() < opts.anno_min_sites:
                log.info('Skipping due to insufficient annotated sites!')
                continue
            # Select data at annotated sites.
            anno_data = slice_dict(data, idx)
            report = ev.evaluate_outputs(anno_data['outputs'],
                                         anno_data['preds'])
            report['anno'] = anno_name
            reports.append(report)

            # BUGFIX: guard on `anno_curve_funs` (the dict iterated below;
            # the original tested `curve_funs`) and evaluate curves on the
            # annotated subset `anno_data` (the original used the global
            # `data`, mislabeling global curves with `anno_name`).
            if anno_curve_funs:
                # Performance curves.
                for name, fun in anno_curve_funs.items():
                    log.info('%s curve' % name)
                    curve = ev.evaluate_curve(anno_data['outputs'],
                                              anno_data['preds'],
                                              fun=fun,
                                              nb_point=opts.nb_curve_point)
                    if curve is not None:
                        curve['curve'] = name
                        curve['anno'] = anno_name
                        curves.append(curve)

    make_dir(opts.out_dir)
    if reports:
        report = pd.concat(reports)
        report = report[['anno', 'metric', 'output', 'value']]
        self.save_report(report, 'metrics')
    if curves:
        curves = pd.concat(curves)
        curves = curves[['anno', 'curve', 'output', 'x', 'y', 'thr']]
        self.save_report(curves, 'curves')
    log.info('Done!')

    return 0