def assemble_params_scnreg(args, logger):
    """
    Build one parameter dict per valid HDF5 input group for region scanning.

    :param args: argparse.Namespace; must provide inputfile, inputgroup,
     modelfile, modelmetadata, module_logger and execute attributes
    :param logger: logger used for progress reporting
    :return: list of per-group parameter dicts (one per chromosome group)
    """
    all_groups = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    logger.debug('Identified {} valid groups in input file'.format(
        len(all_groups)))
    merge_regions = False
    with pd.HDFStore(args.inputfile, 'r') as hdf:
        md = hdf['metadata']
        res = md.loc[0, 'resolution']
        # a positive resolution marks binned data, so resulting
        # regions need to be merged downstream
        if res > 0:
            merge_regions = True
    logger.debug('Detected - merge regions: {}'.format(merge_regions))
    if not args.modelmetadata:
        # default: metadata JSON lives next to the model file
        fpath_md = args.modelfile.rsplit('.', 1)[0] + '.json'
    else:
        fpath_md = args.modelmetadata
    # BUGFIX: vars(args) returns args.__dict__ itself, so the deletions
    # below used to strip the module_logger/execute attributes off the
    # caller's args object as a side effect; copy first instead
    commons = dict(vars(args))
    del commons['module_logger']
    del commons['execute']
    commons['modelmetadata'] = fpath_md
    arglist = []
    for g in all_groups:
        tmp = dict(commons)
        tmp['inputgroup'] = g
        _, tmp['chrom'] = os.path.split(g)
        tmp['merge'] = merge_regions
        arglist.append(tmp)
    logger.debug('Build argument list of size {} to process'.format(
        len(arglist)))
    return arglist
def assemble_params_estsig(args):
    """
    Assemble per-chromosome parameter sets for signal estimation.

    :param args: argparse.Namespace with input file/group, model file
     and smoothing options
    :return: list of parameter dicts, one per valid HDF5 group
    """
    group_paths = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    model_md = load_model_metadata(args)
    shared = dict(
        modelfile=args.modelfile,
        resolution=int(model_md['resolution']),
        seqfile=args.seqfile,
        targetindex=args.targetindex,
        inputfile=args.inputfile,
        inputgroup=args.inputgroup,
        features=model_md['features'],
        kmers=model_md['kmers'],
        feature_order=model_md['feature_order'],
        nosmooth=args.nosmooth,
    )
    # the last path component of each group is the chromosome name
    return [{**shared, 'chrom': grp.rsplit('/', 1)[1]} for grp in group_paths]
def assemble_worker_params(args):
    """
    Build per-chromosome worker parameters, pairing each foreground
    input group with a background output group of the same chromosome.

    :param args: argparse.Namespace with input/output groups, genome
     file and match-relaxation settings
    :return: list of parameter dicts, one per input group
    """
    base_params = {
        'inputfile': args.inputfile,
        'twobitgenome': args.twobitgenome,
        'features': args.features,
        'kmers': args.kmers,
        'timeout': args.timeout,
        'relax_init': args.relaxinit,
        'relax_step': args.relaxstep,
        'relax_limit': args.relaxlimit,
        'allownomatch': args.allownomatch,
    }
    param_sets = []
    for fg_group in get_valid_hdf5_groups(args.inputfile, args.inputgroup):
        chrom = os.path.split(fg_group)[1]
        bg_group = os.path.join(args.outputgroup, chrom)
        # reading and writing the same HDF5 group would clobber the input
        assert fg_group.strip('/') != bg_group.strip('/'), \
            'Foreground and background groups identical: {} and {}\n' \
            'Please specify a different output group'.format(fg_group, bg_group)
        entry = dict(base_params)
        entry['chrom'] = chrom
        entry['group_fg'] = fg_group
        entry['group_bg'] = bg_group
        param_sets.append(entry)
    return param_sets
def assemble_clsreg_args(args, logger):
    """
    Build per-chromosome argument sets pairing positive and negative
    input groups for classification/regression region processing.

    :param args: argparse.Namespace with signal/ROI/map file options and
     positive/negative input and output group names
    :param logger: logger used for progress reporting
    :return: list of parameter dicts, one per selected chromosome
    """
    sig_files, sig_labels, sig_groups = build_featurefile_info(args.sigfile)
    roi_files, roi_labels, roi_groups = build_featurefile_info(args.roifile)
    shared = {
        'inputfile': args.inputfile,
        'addseq': args.addseq,
        'seqfile': args.seqfile,
        'features': args.features,
        'kmers': args.kmers,
        'tfmotifs': args.tfmotifs,
        'siglabels': sig_labels,
        'sigfiles': sig_files,
        'siggroups': sig_groups,
        'roilabels': roi_labels,
        'roifiles': roi_files,
        'roigroups': roi_groups,
        'roiquant': args.roiquant,
        'mapfile': args.mapfile,
        'mapreference': args.mapreference,
    }
    chrom_select = re.compile(args.selectchroms)
    pos_groups = get_valid_hdf5_groups(args.inputfile, args.posingroup)
    neg_groups = get_valid_hdf5_groups(args.inputfile, args.negingroup)
    param_sets = []
    for pos_grp in pos_groups:
        chrom = os.path.split(pos_grp)[1]
        if chrom_select.match(chrom) is None:
            logger.debug('Skipping group {}'.format(chrom))
            continue
        # exactly one negative group must match this chromosome
        matches = [n for n in neg_groups if n.endswith(chrom)]
        assert len(
            matches) == 1, 'Cannot find negative group for positive {}'.format(
            pos_grp)
        entry = dict(shared)
        entry['posingroup'] = pos_grp
        entry['negingroup'] = matches[0]
        entry['posoutgroup'] = os.path.join(args.posoutgroup, chrom)
        entry['negoutgroup'] = os.path.join(args.negoutgroup, chrom)
        entry['chrom'] = chrom
        param_sets.append(entry)
    return param_sets
def assemble_scnreg_args(args, logger):
    """
    Assemble per-chromosome argument sets for scanning regions, collecting
    signal, ROI and associated-region feature files plus windowing options.

    :param args: argparse.Namespace with map/signal/ROI/window options
    :param logger: logger used for progress reporting
    :return: list of parameter dicts, one per selected input group
    """
    commons = dict()
    commons['mapfile'] = args.mapfile
    commons['mapreference'] = args.mapreference
    # if a map file is given, derive the assembly name from it and use that
    # to sanity-check the other input file names below
    if commons['mapfile'] and os.path.isfile(commons['mapfile']):
        assembly = get_assembly_info(commons['mapfile'], commons['mapreference'])
    else:
        assembly = ''
    commons['inputfile'] = args.inputfile
    if assembly:
        # guard against mixing files from different genome assemblies
        assert assembly in os.path.basename(args.inputfile), \
            'Expected assembly {} in filename {}'.format(assembly, args.inputfile)
    commons['addseq'] = args.addseq
    commons['seqfile'] = args.seqfile
    if assembly and os.path.isfile(args.seqfile):
        assert assembly in os.path.basename(args.seqfile), \
            'Expected assembly {} in filename {}'.format(assembly, args.seqfile)
    commons['features'] = args.features
    commons['kmers'] = args.kmers
    commons['tfmotifs'] = args.tfmotifs
    commons['window'] = args.window
    commons['stepsize'] = args.stepsize
    # signal feature files (filtered by assembly when known)
    sigfiles, siglabels, siggroups = build_featurefile_info(
        args.sigfile, assembly)
    commons['siglabels'] = siglabels
    commons['sigfiles'] = sigfiles
    commons['siggroups'] = siggroups
    # region-of-interest feature files
    roifiles, roilabels, roigroups = build_featurefile_info(
        args.roifile, assembly)
    commons['roilabels'] = roilabels
    commons['roifiles'] = roifiles
    commons['roigroups'] = roigroups
    commons['roiquant'] = args.roiquant
    # chromosome selection is a user-supplied regular expression
    check = re.compile(args.selectchroms)
    # associated/ascertainment region files
    ascfiles, asclabels, ascgroups = build_featurefile_info(
        args.ascregions, assembly)
    commons['ascfiles'] = ascfiles
    commons['asclabels'] = asclabels
    commons['ascgroups'] = ascgroups
    ingroups = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    arglist = []
    for grp in ingroups:
        prefix, chrom = os.path.split(grp)
        if check.match(chrom) is None:
            logger.debug('Skipping group {}'.format(chrom))
            continue
        tmp = dict(commons)
        tmp['inputgroup'] = grp
        tmp['outputgroup'] = os.path.join(args.outputgroup, chrom)
        tmp['chrom'] = chrom
        arglist.append(tmp)
    return arglist
def dump_regions(args, logger):
    """
    Dump all region groups of an HDF5 file as one sorted delimited table,
    written either to stdout or to an output file.

    :param args: argparse.Namespace with input/output paths, delimiter and
     header/index formatting flags
    :param logger: logger used for progress reporting
    :return: None
    """
    group_paths = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    frames = []
    with pd.HDFStore(args.inputfile, 'r') as hdf:
        for grp in group_paths:
            chrom = grp.rsplit('/', 1)[1]
            regions = hdf[grp]
            regions['chrom'] = chrom
            # record original row index and group path so the dump can be
            # traced back to its HDF5 location; refuse to overwrite columns
            assert 'crp_group_index' not in regions.columns, 'Column name duplicate: crp_group_index'
            regions['crp_group_index'] = regions.index.tolist()
            assert 'crp_group_path' not in regions.columns, 'Column name duplicate: crp_group_path'
            regions['crp_group_path'] = grp
            frames.append(regions)
    logger.debug('Concatenating all data groups...')
    merged = pd.concat(frames, axis=0, join='outer', ignore_index=True)
    merged.sort_values(by=['chrom', 'start', 'end'], inplace=True)
    merged.reset_index(drop=True, inplace=True)
    logger.debug('Final dataset size: {}'.format(merged.shape))
    merged = rearrange_columns(merged, args.commentheader)
    sep = args.delimiter.strip('"')
    if args.outputfile in ['stdout', '-']:
        logger.debug('Dumping data to stdout')
        dest = sys.stdout
    else:
        logger.debug('Dumping data to file')
        dest = args.outputfile
    # the four header/index combinations reduce to two boolean switches
    merged.to_csv(dest, sep=sep, header=not args.noheader,
                  index=bool(args.rtable), index_label=None)
    return
def assemble_worker_params(args):
    """
    Build per-chromosome comparison parameters for two input files,
    restricted to the chromosomes both files have in common.

    :param args: argparse.Namespace with two input file/group pairs and
     map/ROI comparison options
    :return: list of parameter dicts, one per shared chromosome
    """
    def resolve_group(group, fpath):
        # fall back to the file's default group for unset/placeholder values
        if not group or group in ['default', 'auto']:
            return get_default_group(fpath)
        return group

    ingrp_a = resolve_group(args.inputgroupa, args.inputfilea)
    ingrp_b = resolve_group(args.inputgroupb, args.inputfileb)
    groups_a = get_valid_hdf5_groups(args.inputfilea, ingrp_a)
    groups_b = get_valid_hdf5_groups(args.inputfileb, ingrp_b)
    chroms_a = {os.path.split(g)[1] for g in groups_a}
    chroms_b = {os.path.split(g)[1] for g in groups_b}
    shared_chroms = chroms_a & chroms_b
    assert shared_chroms, 'No shared chromosomes between input files/groups'
    base_params = {
        'inputfilea': args.inputfilea,
        'inputfileb': args.inputfileb,
        'inputgroupa': ingrp_a,
        'inputgroupb': ingrp_b,
        'mapfile': args.mapfile,
        'measure': args.measure,
        'roifile': args.roifile,
        'roilimit': args.roilimit,
        'skipsize': args.skipsize,
    }
    index_groups = get_mapindex_groups(args.mapfile, args.mapreference)
    param_sets = []
    for chrom in shared_chroms:
        entry = dict(base_params)
        entry['chrom'] = chrom
        entry['loadgroupa'] = os.path.join(ingrp_a, chrom)
        entry['loadgroupb'] = os.path.join(ingrp_b, chrom)
        # merge in the chromosome's map index group paths
        entry.update(index_groups[chrom])
        param_sets.append(entry)
    return param_sets
def run_apply_model(args):
    """
    Apply a trained model (classifier or regressor) to the input data.

    The target specification stored in the model metadata takes precedence
    over the command-line values and is written back into ``args``.

    :param args: argparse.Namespace for the apply task
    :return: 0 on success
    :raises AssertionError: if the metadata specifies no target variable
    :raises NotImplementedError: for unknown model types
    """
    logger = args.module_logger
    logger.debug('Loading model and metadata...')
    model_md = load_model_metadata(args)
    logger.debug('Metadata successfully loaded')
    model = load_model(args.modelfile)
    logger.debug('Model successfully loaded')
    md_training = model_md['dataset_info']
    drv_trg = md_training['derive_target']
    trg_var = md_training['target_var']
    if drv_trg:
        logger.debug(
            'Found "derive_target" value in model metadata - overwriting...')
        setattr(args, 'derivetarget', drv_trg)
        setattr(args, 'targetvar', '')
    elif trg_var:
        logger.debug('Found "target_var" in model metadata - overwriting')
        setattr(args, 'derivetarget', '')
        setattr(args, 'targetvar', trg_var)
    else:
        raise AssertionError(
            'Invalid model metadata - '
            'target variable name has to be specified as '
            '"target_var" in section "dataset_info": {}'.format(md_training))
    model_type = model_md['model_info']['type']
    logger.debug('Loading groups from input data file')
    load_groups = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    model_name = model_md['model_info']['name']
    if model_type == 'classifier':
        logger.debug('Applying classification model {} on task: {}'.format(
            model_name, args.task))
        _ = run_classification(args, model, model_md, load_groups, logger)
    elif model_type == 'regressor':
        logger.debug('Applying regression model {} on task: {}'.format(
            model_name, args.task))
        _ = run_regression(args, model, model_md, load_groups, logger)
    else:
        raise NotImplementedError(
            'No support for model of type: {} '
            '(just classifier or regressor are supported)'.format(model_type))
    return 0
def run_train_model(args):
    """
    Train a model according to a JSON model specification, either with fixed
    default parameters or via grid-search CV tuning, then persist the model
    (pickle) and its run metadata (JSON) to disk.

    :param args: argparse.Namespace with input file/group, model spec path,
     output paths, tuning and CV options
    :return: 0 on success
    """
    logger = args.module_logger
    _ = create_filepath(args.modelout, logger)
    logger.debug('Loading model specification from {}'.format(args.modelspec))
    # NOTE(review): the file handle from open() is never closed explicitly;
    # consider a with-statement here
    model_spec = json.load(open(args.modelspec))
    model = load_model(model_spec['module_path'], model_spec['model_name'])
    load_groups = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    traindata, targets, dtinfo, sminfo, ftinfo = load_ml_dataset(args.inputfile, load_groups, None, args, logger)
    assert traindata.shape[0] > 1, 'No samples (rows) in training data'
    assert traindata.shape[1] > 1, 'No features (columns) in training data'
    if 'preprocess' in model_spec and model_spec['preprocess']:
        logger.debug('Preprocessing dataset with method: {}'.format(model_spec['preprocessor']['preprocessor_name']))
        traindata, prepinfo = apply_preprocessor(traindata, model_spec['preprocessor'], 'train')
    else:
        prepinfo = None
    if targets is not None:
        assert targets.size == traindata.shape[0], 'Mismatch num targets {} and num samples {}'.format(targets.size, traindata.shape[0])
    # metadata skeleton; sections are filled in as training proceeds
    run_metadata = {'dataset_info': dtinfo, 'sample_info': sminfo, 'feature_info': ftinfo, 'model_info': dict()}
    if prepinfo is not None:
        run_metadata['preprocess_info'] = prepinfo
    logger.debug('Training model')
    if args.notuning:
        # fixed parameters, no cross-validation
        params = model_spec['default']
        model = train_nocv(model, params, traindata, targets, sminfo['weights'])
        run_metadata['model_info']['params'] = params
        run_metadata['model_info']['tuned'] = False
    else:
        # grid-search CV; keep the best estimator and its tuning results
        params = model_spec['cvtune']
        tune_info = train_gridcv(model, params, traindata, targets, args.cvfolds, args.workers, sminfo['weights'])
        model = tune_info.best_estimator_
        run_metadata['model_info']['params'] = tune_info.best_params_
        run_metadata['model_info']['tuned'] = True
        run_metadata['training_info'] = dict()
        run_metadata['training_info']['cv_scores'] = simplify_cv_scores(tune_info.cv_results_)
        run_metadata['training_info']['best_score'] = tune_info.best_score_
        run_metadata['training_info']['best_index'] = int(tune_info.best_index_)
        run_metadata['training_info']['scoring'] = params['scoring']
    run_metadata['model_info']['name'] = model_spec['model_name']
    run_metadata['model_info']['type'] = model_spec['model_type']
    if model_spec['model_type'] == 'classifier':
        # NOTE(review): 'training_info' is only created in the tuning branch;
        # with --notuning this line raises KeyError for classifiers — confirm
        run_metadata['training_info']['class_order'] = list(map(int, model.classes_))
    logger.debug('Training finished')
    if 'store_attributes' in model_spec:
        logger.debug('Storing user requested model attributes')
        attribs = extract_model_attributes(model, model_spec['store_attributes'], logger)
        run_metadata['attribute_info'] = attribs
    if args.calcweights:
        raise NotImplementedError('Currently not functional')
    logger.debug('Saving model and metadata')
    run_metadata['run_info'] = dict()
    run_metadata['run_info']['model_spec'] = os.path.basename(args.modelspec)
    run_metadata['run_info']['model_file'] = os.path.basename(args.modelout)
    run_metadata['run_info']['train_data'] = os.path.basename(args.inputfile)
    run_metadata['run_info']['train_group'] = args.inputgroup
    logger.debug('Writing model file...')
    with open(args.modelout, 'wb') as outfile:
        pck.dump(model, outfile)
    if not args.metadataout:
        # default: metadata JSON lives next to the model file
        mdout = args.modelout.rsplit('.', 1)[0] + '.json'
    else:
        mdout = args.metadataout
    _ = create_filepath(mdout, logger)
    logger.debug('Writing model metadata...')
    with open(mdout, 'w') as outfile:
        _ = json.dump(run_metadata, outfile, indent=1, sort_keys=True)
    logger.debug('Done')
    return 0
def dump_index(args, logger):
    """
    Dump the alignment blocks of a map index HDF5 file (layout
    /qt/blocks/<qchrom>/<tchrom>) as tab-separated rows, ordered either by
    target or query chromosome depending on args.mapreference.

    :param args: argparse.Namespace with inputfile, outputfile,
     mapreference and fullblocks options
    :param logger: logger used for progress reporting
    :return: None
    """
    with pd.HDFStore(args.inputfile, 'r') as hdf:
        block_groups = get_valid_hdf5_groups(args.inputfile, '/qt/blocks')
        assert block_groups, 'Map index file does not contain standard groups: /qt/blocks/qchrom/tchrom'
        logger.debug('Identified {} blocks in map file'.format(
            len(block_groups)))
        # group path is /qt/blocks/<qchrom>/<tchrom>: second-to-last
        # component is the query, last is the target chromosome
        query_chroms = sorted(set([b.rsplit('/', 2)[1] for b in block_groups]))
        target_chroms = sorted(set([b.rsplit('/', 1)[1] for b in block_groups]))
        # 'top' drives the outer iteration order of the dump
        if args.mapreference == 'target':
            block_filter = '/qt/blocks/{bottom}/{top}'
            top_chroms = target_chroms
            bottom_chroms = query_chroms
        else:
            block_filter = '/qt/blocks/{top}/{bottom}'
            top_chroms = query_chroms
            bottom_chroms = target_chroms
        iter_done = False
        try:
            out_dest = set_output(args.outputfile)
            for top in top_chroms:
                for bottom in bottom_chroms:
                    selector = block_filter.format(**{
                        'top': top,
                        'bottom': bottom
                    })
                    # not every chromosome pair has alignment blocks
                    if selector not in block_groups:
                        continue
                    logger.debug('Dumping block: {}'.format(selector))
                    _, qchrom, tchrom = selector.rsplit('/', 2)
                    blocks = hdf[selector]
                    blocks['tchrom'] = tchrom
                    blocks['qchrom'] = qchrom
                    # target strand is always '+'; query strand is stored
                    # numerically and rendered as '+'/'-'
                    blocks['tstrand'] = '+'
                    blocks.replace({'qstrand': {
                        1: '+',
                        -1: '-'
                    }}, inplace=True)
                    blocks['blockid'] = blocks.index
                    order, sort_by = get_index_column_order(
                        args.mapreference, args.fullblocks)
                    blocks = blocks[order]
                    blocks.sort_values(sort_by, axis=0, inplace=True)
                    logger.debug('Writing {} rows...'.format(blocks.shape[0]))
                    blocks.to_csv(out_dest, sep='\t', header=False, index=False)
            # distinguishes errors during dumping from errors on close
            iter_done = True
            # NOTE(review): out_dest is not closed if an exception escapes
            # the loop above — consider a finally/with; confirm whether
            # set_output may return sys.stdout before changing this
            out_dest.close()
        except ValueError:
            if not iter_done:
                logger.error('Error raised before dumping map file completed')
            raise
    logger.debug('Dumped map index')
    return