Example #1
def assemble_params_scnreg(args, logger):
    """
    :return:
    """
    all_groups = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    logger.debug('Identified {} valid groups in input file'.format(
        len(all_groups)))
    merge_regions = False
    with pd.HDFStore(args.inputfile, 'r') as hdf:
        md = hdf['metadata']
        res = md.loc[0, 'resolution']
        if res > 0:
            merge_regions = True
    logger.debug('Detected - merge regions: {}'.format(merge_regions))
    if not args.modelmetadata:
        fpath_md = args.modelfile.rsplit('.', 1)[0] + '.json'
    else:
        fpath_md = args.modelmetadata
    # copy the namespace dict - vars() returns a live reference into args,
    # so deleting keys directly would mutate the args object itself
    commons = dict(vars(args))
    del commons['module_logger']
    del commons['execute']
    commons['modelmetadata'] = fpath_md
    arglist = []
    for g in all_groups:
        tmp = dict(commons)
        tmp['inputgroup'] = g
        _, tmp['chrom'] = os.path.split(g)
        tmp['merge'] = merge_regions
        arglist.append(tmp)
    logger.debug('Built argument list of size {} to process'.format(
        len(arglist)))
    return arglist
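
Every example in this collection calls get_valid_hdf5_groups to enumerate the per-chromosome groups below a root path. The helper itself is not shown on this page; the following is a minimal sketch of what it might look like, assuming the file is a pandas HDFStore (the name suffix and the prefix filter are assumptions, not the original implementation):

import pandas as pd


def get_valid_hdf5_groups_sketch(inputfile, inputgroup):
    """Hypothetical reconstruction: list all group paths below a prefix."""
    prefix = '/' + inputgroup.strip('/')
    with pd.HDFStore(inputfile, 'r') as hdf:
        # HDFStore.keys() yields absolute paths such as '/signal/chr1'
        groups = [k for k in hdf.keys() if k.startswith(prefix)]
    return sorted(groups)
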
Example #2
def assemble_params_estsig(args):
    """
    :param args:
    :return:
    """
    all_groups = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    model_md = load_model_metadata(args)
    commons = {
        'modelfile': args.modelfile,
        'resolution': int(model_md['resolution']),
        'seqfile': args.seqfile,
        'targetindex': args.targetindex,
        'inputfile': args.inputfile,
        'inputgroup': args.inputgroup,
        'features': model_md['features'],
        'kmers': model_md['kmers'],
        'feature_order': model_md['feature_order'],
        'nosmooth': args.nosmooth
    }
    arglist = []
    for g in all_groups:
        chrom = g.rsplit('/', 1)[1]
        tmp = dict(commons)
        tmp['chrom'] = chrom
        arglist.append(tmp)
    return arglist
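
All of these assemble_* functions return a list of self-contained, per-chromosome parameter dicts, which strongly suggests they feed a worker pool. A minimal sketch of how such a list might be consumed, with process_chromosome as a purely hypothetical worker:

import multiprocessing as mp


def process_chromosome(params):
    """Hypothetical worker: process one chromosome's parameter dict."""
    return params['chrom'], 'ok'


def run_in_parallel(arglist, num_workers=4):
    # each dict is self-contained, so the workers share no state
    with mp.Pool(num_workers) as pool:
        for chrom, status in pool.imap_unordered(process_chromosome, arglist):
            print(chrom, status)
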
Example #3
def assemble_worker_params(args):
    """
    :param args:
    :return:
    """
    commons = dict()
    commons['inputfile'] = args.inputfile
    commons['twobitgenome'] = args.twobitgenome
    commons['features'] = args.features
    commons['kmers'] = args.kmers
    commons['timeout'] = args.timeout
    commons['relax_init'] = args.relaxinit
    commons['relax_step'] = args.relaxstep
    commons['relax_limit'] = args.relaxlimit
    commons['allownomatch'] = args.allownomatch
    ingroups = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    arglist = []
    for ig in ingroups:
        tmp = dict(commons)
        _, chrom = os.path.split(ig)
        tmp['chrom'] = chrom
        fg_grp = ig
        bg_grp = os.path.join(args.outputgroup, chrom)
        assert fg_grp.strip('/') != bg_grp.strip('/'), \
            'Foreground and background groups identical: {} and {}\n' \
            'Please specify a different output group'.format(fg_grp, bg_grp)

        tmp['group_fg'] = fg_grp
        tmp['group_bg'] = bg_grp
        arglist.append(tmp)
    return arglist
Example #4
def assemble_clsreg_args(args, logger):
    """
    :param args:
    :return:
    """
    commons = dict()
    commons['inputfile'] = args.inputfile
    commons['addseq'] = args.addseq
    commons['seqfile'] = args.seqfile
    commons['features'] = args.features
    commons['kmers'] = args.kmers
    commons['tfmotifs'] = args.tfmotifs
    sigfiles, siglabels, siggroups = build_featurefile_info(args.sigfile)
    commons['siglabels'] = siglabels
    commons['sigfiles'] = sigfiles
    commons['siggroups'] = siggroups
    roifiles, roilabels, roigroups = build_featurefile_info(args.roifile)
    commons['roilabels'] = roilabels
    commons['roifiles'] = roifiles
    commons['roigroups'] = roigroups
    commons['roiquant'] = args.roiquant
    commons['mapfile'] = args.mapfile
    commons['mapreference'] = args.mapreference
    check = re.compile(args.selectchroms)
    posgroups = get_valid_hdf5_groups(args.inputfile, args.posingroup)
    neggroups = get_valid_hdf5_groups(args.inputfile, args.negingroup)
    arglist = []
    for grp in posgroups:
        prefix, chrom = os.path.split(grp)
        if check.match(chrom) is None:
            logger.debug('Skipping group {}'.format(chrom))
            continue
        neggrp = list(filter(lambda x: x.endswith(chrom), neggroups))
        assert len(neggrp) == 1, \
            'Expected exactly one negative group for positive group {}, ' \
            'found: {}'.format(grp, neggrp)
        neggrp = neggrp[0]
        tmp = dict(commons)
        tmp['posingroup'] = grp
        tmp['negingroup'] = neggrp
        tmp['posoutgroup'] = os.path.join(args.posoutgroup, chrom)
        tmp['negoutgroup'] = os.path.join(args.negoutgroup, chrom)
        tmp['chrom'] = chrom
        arglist.append(tmp)
    return arglist
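
The selectchroms option is compiled as a plain regular expression and matched against each chromosome name with re.match, i.e. anchored at the start of the string. An illustrative pattern (not taken from the source):

import re

# keep canonical chromosomes, skip unplaced scaffolds and random contigs
check = re.compile('chr[0-9XY]+$')
for name in ['chr1', 'chrX', 'chrUn_gl000220', 'chr11_gl000202_random']:
    print(name, bool(check.match(name)))
# chr1 True, chrX True, chrUn_gl000220 False, chr11_gl000202_random False
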
Example #5
def assemble_scnreg_args(args, logger):
    """
    :param args:
    :return:
    """
    commons = dict()
    commons['mapfile'] = args.mapfile
    commons['mapreference'] = args.mapreference
    if commons['mapfile'] and os.path.isfile(commons['mapfile']):
        assembly = get_assembly_info(commons['mapfile'],
                                     commons['mapreference'])
    else:
        assembly = ''
    commons['inputfile'] = args.inputfile
    if assembly:
        assert assembly in os.path.basename(args.inputfile), \
            'Expected assembly {} in filename {}'.format(assembly, args.inputfile)
    commons['addseq'] = args.addseq
    commons['seqfile'] = args.seqfile
    if assembly and os.path.isfile(args.seqfile):
        assert assembly in os.path.basename(args.seqfile), \
            'Expected assembly {} in filename {}'.format(assembly, args.seqfile)
    commons['features'] = args.features
    commons['kmers'] = args.kmers
    commons['tfmotifs'] = args.tfmotifs
    commons['window'] = args.window
    commons['stepsize'] = args.stepsize
    sigfiles, siglabels, siggroups = build_featurefile_info(
        args.sigfile, assembly)
    commons['siglabels'] = siglabels
    commons['sigfiles'] = sigfiles
    commons['siggroups'] = siggroups
    roifiles, roilabels, roigroups = build_featurefile_info(
        args.roifile, assembly)
    commons['roilabels'] = roilabels
    commons['roifiles'] = roifiles
    commons['roigroups'] = roigroups
    commons['roiquant'] = args.roiquant
    check = re.compile(args.selectchroms)
    ascfiles, asclabels, ascgroups = build_featurefile_info(
        args.ascregions, assembly)
    commons['ascfiles'] = ascfiles
    commons['asclabels'] = asclabels
    commons['ascgroups'] = ascgroups
    ingroups = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    arglist = []
    for grp in ingroups:
        prefix, chrom = os.path.split(grp)
        if check.match(chrom) is None:
            logger.debug('Skipping group {}'.format(chrom))
            continue
        tmp = dict(commons)
        tmp['inputgroup'] = grp
        tmp['outputgroup'] = os.path.join(args.outputgroup, chrom)
        tmp['chrom'] = chrom
        arglist.append(tmp)
    return arglist
Example #6
def dump_regions(args, logger):
    """
    :param args:
    :param logger:
    :return:
    """
    load_groups = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    dump = []
    with pd.HDFStore(args.inputfile, 'r') as hdf:
        for lg in load_groups:
            _, chrom_name = lg.rsplit('/', 1)
            data = hdf[lg]
            data['chrom'] = chrom_name
            assert 'crp_group_index' not in data.columns, 'Column name duplicate: crp_group_index'
            data['crp_group_index'] = data.index.tolist()
            assert 'crp_group_path' not in data.columns, 'Column name duplicate: crp_group_path'
            data['crp_group_path'] = lg
            dump.append(data)
    logger.debug('Concatenating all data groups...')
    dump = pd.concat(dump, axis=0, join='outer', ignore_index=True)
    dump.sort_values(by=['chrom', 'start', 'end'], inplace=True)
    dump.reset_index(drop=True, inplace=True)
    logger.debug('Final dataset size: {}'.format(dump.shape))
    dump = rearrange_columns(dump, args.commentheader)
    col_delim = args.delimiter.strip('"')
    if args.outputfile in ['stdout', '-']:
        logger.debug('Dumping data to stdout')
        out = sys.stdout
    else:
        logger.debug('Dumping data to file')
        out = args.outputfile
    # 'noheader' and 'rtable' are independent options, so the four
    # combinations collapse into two to_csv() flags
    dump.to_csv(out,
                sep=col_delim,
                header=not args.noheader,
                index=args.rtable,
                index_label=None)
    return
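
A dump written with the rtable option keeps the row index as an unnamed leading column, matching R's read.table convention. Assuming a tab delimiter and a hypothetical output name, the file can be read back with pandas like this:

import pandas as pd

# index_col=0 restores the unnamed index column written for R compatibility
regions = pd.read_csv('regions_dump.tsv', sep='\t', index_col=0)
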
Example #7
def assemble_worker_params(args):
    """
    :param args:
    :return:
    """
    ingrp_a = args.inputgroupa
    if not ingrp_a or ingrp_a in ['default', 'auto']:
        ingrp_a = get_default_group(args.inputfilea)
    ingrp_b = args.inputgroupb
    if not ingrp_b or ingrp_b in ['default', 'auto']:
        ingrp_b = get_default_group(args.inputfileb)

    groups_a = get_valid_hdf5_groups(args.inputfilea, ingrp_a)
    groups_b = get_valid_hdf5_groups(args.inputfileb, ingrp_b)

    chroms_a = set(os.path.split(g)[1] for g in groups_a)
    chroms_b = set(os.path.split(g)[1] for g in groups_b)
    shared_chroms = chroms_a.intersection(chroms_b)
    assert shared_chroms, 'No shared chromosomes between input files/groups'
    commons = {
        'inputfilea': args.inputfilea,
        'inputfileb': args.inputfileb,
        'inputgroupa': ingrp_a,
        'inputgroupb': ingrp_b,
        'mapfile': args.mapfile,
        'measure': args.measure,
        'roifile': args.roifile,
        'roilimit': args.roilimit,
        'skipsize': args.skipsize
    }
    index_groups = get_mapindex_groups(args.mapfile, args.mapreference)
    arglist = []
    for chrom in shared_chroms:
        tmp = dict(commons)
        tmp['chrom'] = chrom
        tmp['loadgroupa'] = os.path.join(ingrp_a, chrom)
        tmp['loadgroupb'] = os.path.join(ingrp_b, chrom)
        tmp.update(index_groups[chrom])
        arglist.append(tmp)
    return arglist
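
One subtlety in these examples: os.path.join is used to build HDF5 group paths. That is fine on POSIX systems, but on Windows it would join with backslashes, which are not valid HDF5 separators. A portable alternative is posixpath:

import posixpath

# posixpath always joins with '/', regardless of the host OS
group = posixpath.join('/signal', 'chr1')  # -> '/signal/chr1'
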
Example #8
def run_apply_model(args):
    """
    :param args:
    :return:
    """
    logger = args.module_logger
    logger.debug('Loading model and metadata...')
    model_md = load_model_metadata(args)
    logger.debug('Metadata successfully loaded')
    model = load_model(args.modelfile)
    logger.debug('Model successfully loaded')
    md_training = model_md['dataset_info']
    drv_trg = md_training['derive_target']
    trg_var = md_training['target_var']
    if drv_trg:
        logger.debug(
            'Found "derive_target" value in model metadata - overwriting...')
        setattr(args, 'derivetarget', drv_trg)
        setattr(args, 'targetvar', '')
    elif trg_var:
        logger.debug('Found "target_var" in model metadata - overwriting')
        setattr(args, 'derivetarget', '')
        setattr(args, 'targetvar', trg_var)
    else:
        raise AssertionError(
            'Invalid model metadata - '
            'target variable name has to be specified as '
            '"target_var" in section "dataset_info": {}'.format(md_training))
    model_type = model_md['model_info']['type']
    logger.debug('Loading groups from input data file')
    load_groups = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    if model_type == 'classifier':
        logger.debug('Applying classification model {} on task: {}'.format(
            model_md['model_info']['name'], args.task))
        _ = run_classification(args, model, model_md, load_groups, logger)
    elif model_type == 'regressor':
        logger.debug('Applying regression model {} on task: {}'.format(
            model_md['model_info']['name'], args.task))
        _ = run_regression(args, model, model_md, load_groups, logger)
    else:
        raise NotImplementedError(
            'No support for model of type: {} '
            '(only "classifier" and "regressor" are supported)'.format(
                model_type))
    return 0
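
Combining this example with Example #2, the model metadata JSON evidently carries at least the following keys; the values below are illustrative placeholders only:

model_md_example = {
    'resolution': 25,                    # signal resolution in bp
    'features': ['gc', 'cpg'],           # placeholder feature names
    'kmers': [2, 3],
    'feature_order': ['gc', 'cpg'],
    'dataset_info': {
        'derive_target': '',             # expression to derive targets, or empty
        'target_var': 'signal_mean'      # placeholder target column name
    },
    'model_info': {
        'type': 'regressor',             # 'classifier' or 'regressor'
        'name': 'rnd_forest'             # placeholder model name
    }
}
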
Example #9
def run_train_model(args):
    """
    :param args:
    :return:
    """
    logger = args.module_logger
    _ = create_filepath(args.modelout, logger)
    logger.debug('Loading model specification from {}'.format(args.modelspec))
    with open(args.modelspec) as spec_file:
        model_spec = json.load(spec_file)
    model = load_model(model_spec['module_path'], model_spec['model_name'])
    load_groups = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    traindata, targets, dtinfo, sminfo, ftinfo = load_ml_dataset(args.inputfile, load_groups, None, args, logger)
    assert traindata.shape[0] > 1, 'No samples (rows) in training data'
    assert traindata.shape[1] > 1, 'No features (columns) in training data'
    if 'preprocess' in model_spec and model_spec['preprocess']:
        logger.debug('Preprocessing dataset with method: {}'.format(model_spec['preprocessor']['preprocessor_name']))
        traindata, prepinfo = apply_preprocessor(traindata, model_spec['preprocessor'], 'train')
    else:
        prepinfo = None
    if targets is not None:
        assert targets.size == traindata.shape[0], 'Mismatch num targets {} and num samples {}'.format(targets.size, traindata.shape[0])
    run_metadata = {'dataset_info': dtinfo, 'sample_info': sminfo,
                    'feature_info': ftinfo, 'model_info': dict()}
    if prepinfo is not None:
        run_metadata['preprocess_info'] = prepinfo
    logger.debug('Training model')
    if args.notuning:
        params = model_spec['default']
        model = train_nocv(model, params, traindata, targets, sminfo['weights'])
        run_metadata['model_info']['params'] = params
        run_metadata['model_info']['tuned'] = False
    else:
        params = model_spec['cvtune']
        tune_info = train_gridcv(model, params, traindata, targets, args.cvfolds, args.workers, sminfo['weights'])
        model = tune_info.best_estimator_
        run_metadata['model_info']['params'] = tune_info.best_params_
        run_metadata['model_info']['tuned'] = True
        run_metadata['training_info'] = dict()
        run_metadata['training_info']['cv_scores'] = simplify_cv_scores(tune_info.cv_results_)
        run_metadata['training_info']['best_score'] = tune_info.best_score_
        run_metadata['training_info']['best_index'] = int(tune_info.best_index_)
        run_metadata['training_info']['scoring'] = params['scoring']
    run_metadata['model_info']['name'] = model_spec['model_name']
    run_metadata['model_info']['type'] = model_spec['model_type']
    if model_spec['model_type'] == 'classifier':
        # 'training_info' exists only after CV tuning, so create it on demand
        run_metadata.setdefault('training_info', dict())['class_order'] = \
            list(map(int, model.classes_))
    logger.debug('Training finished')
    if 'store_attributes' in model_spec:
        logger.debug('Storing user requested model attributes')
        attribs = extract_model_attributes(model, model_spec['store_attributes'], logger)
        run_metadata['attribute_info'] = attribs
    if args.calcweights:
        raise NotImplementedError('Currently not functional')

    logger.debug('Saving model and metadata')
    run_metadata['run_info'] = dict()
    run_metadata['run_info']['model_spec'] = os.path.basename(args.modelspec)
    run_metadata['run_info']['model_file'] = os.path.basename(args.modelout)
    run_metadata['run_info']['train_data'] = os.path.basename(args.inputfile)
    run_metadata['run_info']['train_group'] = args.inputgroup

    logger.debug('Writing model file...')
    with open(args.modelout, 'wb') as outfile:
        pck.dump(model, outfile)

    if not args.metadataout:
        mdout = args.modelout.rsplit('.', 1)[0] + '.json'
    else:
        mdout = args.metadataout
    _ = create_filepath(mdout, logger)
    logger.debug('Writing model metadata...')
    with open(mdout, 'w') as outfile:
        _ = json.dump(run_metadata, outfile, indent=1, sort_keys=True)
    logger.debug('Done')
    return 0
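
train_gridcv returns an object exposing best_estimator_, best_params_, best_score_, best_index_ and cv_results_, which matches a fitted scikit-learn GridSearchCV. A minimal sketch of what it might wrap; the 'param_grid' key and the weight handling are assumptions, not the original code:

from sklearn.model_selection import GridSearchCV


def train_gridcv_sketch(model, params, traindata, targets, folds, workers,
                        weights=None):
    """Hypothetical reconstruction of the CV tuning step."""
    tune = GridSearchCV(model,
                        param_grid=params['param_grid'],
                        scoring=params['scoring'],
                        cv=folds,
                        n_jobs=workers)
    # fit params such as sample_weight are forwarded to the estimator
    fit_params = {'sample_weight': weights} if weights is not None else {}
    tune.fit(traindata, targets, **fit_params)
    return tune
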
Example #10
def dump_index(args, logger):
    """
    :param args:
    :param logger:
    :return:
    """
    with pd.HDFStore(args.inputfile, 'r') as hdf:
        block_groups = get_valid_hdf5_groups(args.inputfile, '/qt/blocks')
        assert block_groups, 'Map index file does not contain standard groups: /qt/blocks/qchrom/tchrom'
        logger.debug('Identified {} blocks in map file'.format(
            len(block_groups)))
        query_chroms = sorted(set([b.rsplit('/', 2)[1] for b in block_groups]))
        target_chroms = sorted(set([b.rsplit('/', 1)[1]
                                    for b in block_groups]))
        if args.mapreference == 'target':
            block_filter = '/qt/blocks/{bottom}/{top}'
            top_chroms = target_chroms
            bottom_chroms = query_chroms
        else:
            block_filter = '/qt/blocks/{top}/{bottom}'
            top_chroms = query_chroms
            bottom_chroms = target_chroms
        iter_done = False
        try:
            out_dest = set_output(args.outputfile)
            for top in top_chroms:
                for bottom in bottom_chroms:
                    selector = block_filter.format(**{
                        'top': top,
                        'bottom': bottom
                    })
                    if selector not in block_groups:
                        continue
                    logger.debug('Dumping block: {}'.format(selector))
                    _, qchrom, tchrom = selector.rsplit('/', 2)
                    blocks = hdf[selector]
                    blocks['tchrom'] = tchrom
                    blocks['qchrom'] = qchrom
                    blocks['tstrand'] = '+'
                    blocks.replace({'qstrand': {1: '+', -1: '-'}},
                                   inplace=True)
                    blocks['blockid'] = blocks.index
                    order, sort_by = get_index_column_order(
                        args.mapreference, args.fullblocks)
                    blocks = blocks[order]
                    blocks.sort_values(sort_by, axis=0, inplace=True)
                    logger.debug('Writing {} rows...'.format(blocks.shape[0]))
                    blocks.to_csv(out_dest,
                                  sep='\t',
                                  header=False,
                                  index=False)
            iter_done = True
            out_dest.close()
        except ValueError:
            if not iter_done:
                logger.error('Error raised before dumping map file completed')
                raise
    logger.debug('Dumped map index')
    return
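
set_output is not shown on this page, but dump_index calls .close() on its return value and Example #6 treats 'stdout' and '-' as special output names, so it presumably maps those to sys.stdout and opens anything else as a regular file. A minimal sketch under that assumption:

import sys


def set_output_sketch(outputfile):
    """Hypothetical reconstruction: writable handle for a file or stdout."""
    if outputfile in ['stdout', '-']:
        return sys.stdout
    return open(outputfile, 'w')
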