Example #1
0
File: pipeline.py  Project: bjpop/banzai
The pipeline is configured by an options file in YAML format,
including the actual commands which are run at each stage.
'''

from ruffus import *
import os.path
import shutil
from utils import (runStage, splitPath, getOptions, initLog, getCommand)
from chrom_info import (chromInfo)
import sys
import glob

# Read the configuration options from file, determine the reference file
# and the list of sequence files.
options = getOptions()
reference = options['reference']
# The 'sequences' option may be either a single glob pattern or a list of
# glob patterns; expand it into one flat list of matching file names.
sequencePatterns = options['sequences']
sequences = []
if isinstance(sequencePatterns, list):
    for pattern in sequencePatterns:
        # extend, not append: appending would nest each glob result as a
        # sub-list, which is inconsistent with the single-pattern branch.
        sequences.extend(glob.glob(pattern))
else:
    sequences = glob.glob(sequencePatterns)
# Start the logging process.
logger = initLog(options)
# Get information about chromosomes in the reference file.
chromosomes = chromInfo(reference)

# Index the reference file.
Example #2
0
    # Load the data vectors, the query vectors and the vantage points.
    # NOTE(review): the helpers live in the project-local 'utils' module;
    # their file format is not visible here — presumably one vector per
    # line. Confirm before relying on this description.
    datas = utils.getDataInFile(dataFileName)
    querys = utils.readDataFromFile(queryFileName)
    vps = utils.readDataFromFile(vpFileName)
    print len(datas), len(querys), len(vps)

    # Map every data vector into CDS space: one coordinate per vantage
    # point, each coordinate being the Hamming distance between the
    # vector and that vantage point.
    cdsDatas = []
    for i in xrange(len(datas)):
        t = []
        for j in xrange(len(vps)):
            t.append(utils.hammingDistance(datas[i], vps[j]))
        cdsDatas.append(t)
    utils.writeDataToFile(cdsDataFileName, cdsDatas)

    # Apply the identical vantage-point transformation to the queries so
    # distances remain comparable between data and query sets.
    cdsQuerys = []
    for i in xrange(len(querys)):
        t = []
        for j in xrange(len(vps)):
            t.append(utils.hammingDistance(querys[i], vps[j]))
        cdsQuerys.append(t)
    utils.writeDataToFile(cdsQueryFileName, cdsQuerys)
    print cdsDataFileName, cdsQueryFileName


if __name__ == '__main__':
    # Make sure both output directories exist before converting.
    for outputDir in ('cds_data', 'cds_query'):
        utils.createDirectory(outputDir)
    # Load the run configuration and perform the NDDS-to-CDS conversion.
    options = utils.getOptions()
    convertNDDSToCDS(options)
Example #3
0
    # Enumerate candidate corner points of the discrete space.
    # NOTE(review): getCornerPoints appears to fill the 'cornerPoints'
    # collection (read below) as a side effect — confirm in its definition.
    getCornerPoints(0,[],dim,alphabet,cardinality)
    vps = []
    # Seed the vantage-point set with the all-'A' corner.
    vps.append([ 'A' for i in xrange(dim) ])
    #vps.append([ 'B' for i in xrange(dim) ])
    #vps.append([ 'C' for i in xrange(dim) ])
    #vps.append([ 'D' for i in xrange(dim) ])
    # Greedily pick the remaining vantage points: at each step choose the
    # corner point that minimises the total cost function against the
    # vantage points chosen so far.
    for i in xrange(numberOfVP - 1):
        print i
        #mx, mx_idx = (0,0)
        # Sentinel minimum: any real cost is expected to be smaller.
        mn, mn_idx = (987654321.0,0)
        for j in xrange(len(cornerPoints)):
            variation = getTotalCostFunction(vps,cornerPoints[j],base)
            if mn > variation:
                mn, mn_idx = (variation,j)
            #if mx < dist:
            #    mx, mx_idx = (dist,j)
        print mn,mn_idx,cornerPoints[mn_idx]
        vps.append(cornerPoints[mn_idx])
    # Persist the chosen vantage points, keyed by dimension, count and
    # alphabet cardinality.
    utils.writeDataToFile('vp/vp_%d_%d_%d_greedy.txt'%(dim,numberOfVP,cardinality),vps)


if __name__ == '__main__':
    # Read the configuration and ensure the output directory exists.
    options = utils.getOptions()
    utils.createDirectory('vp')
    # Other strategies (plain, greedy, all-random) exist in this module
    # but are disabled; only the heuristic generator is run.
    generateHeuristicVantagePoints(options)
Example #4
0
It supports parallel evaluation of independent pipeline stages,
and can run stages on a cluster environment.

The pipeline is configured by an options file in YAML format,
including the actual commands which are run at each stage.
'''

from ruffus import *
import os.path
import shutil
from utils import (runStage, splitPath, getOptions, initLog, getCommand)

# Read the configuration options from file, determine the reference file
# and the list of sequence files.
options = getOptions()
reference = options['reference']
sequences = options['sequences']
# Whether the reads are paired-end, as declared in the pipeline config.
isPairedEnd = options['pipeline']['paired']
# Start the logging process.
logger = initLog(options)


# Index the reference file.
@files(reference, reference + '.bwt', logger)
def mkRefDataBase(reference, output, logger):
    """Ruffus task: build the '.bwt' index file for the reference.

    The actual command to run is looked up by stage name
    ('mkRefDataBase') in the module-level options by runStage.
    """
    runStage('mkRefDataBase', logger, options, reference, output)


# Index the reference file.
# XXX not sure why we need to do both mkRefDataBase and indexReference.
Description:

Simple pipeline to demonstrate how to use the base tools.
Counts the number of lines in a set of files and then sums
them up.

'''

import sys
from ruffus import *
from utils import (runStageCheck, getOptions, initLog)
from cmdline_args import get_cmdline_args

# Parse the command line and load the configuration it points at.
args = get_cmdline_args()
options = getOptions(args)
# NOTE(review): logDir is not used in the visible code — confirm it is
# needed (e.g. by later stages) before removing.
logDir = options.pipeline['logDir']
# Start the logging process.
logger = initLog(options)

# The input files processed by this demo pipeline (hard-coded test data).
data_files = ['test_data/data1.txt', 'test_data/data2.txt']

# count the number of lines in a file
@transform(data_files, suffix('.txt'), ['.count', '.count.Success'])
def countLines(file, outputs):
    """Ruffus task: count the lines of *file* into a '.count' output.

    *outputs* is the pair [countFile, flagFile]; the '.Success' flag file
    presumably marks stage completion — runStageCheck's contract lives in
    the project's utils module.
    """
    output,flagFile = outputs
    runStageCheck('countLines', flagFile, logger, options, file, output)

# sum the counts from the previous stage
@merge(countLines,  ['test_data/total.txt', 'test_data/total.Success'])
def total(files, outputs):
Example #6
0
File: rubra.py  Project: scwatts/rubra
def main():
    """Top-level driver for a rubra/ruffus pipeline.

    Parses command-line arguments, imports the pipeline module named on
    the command line so that its stages are registered, then either runs
    the pipeline, draws it as a flowchart, or prints a textual
    description, according to the configured 'style'.
    """

    args = get_cmdline_args()

    # We want to look for modules in the directory local to the pipeline,
    # just as if the pipeline script had been called directly.
    # This includes the script itself and the config files imported by getOptions
    sys.path.insert(0, os.path.dirname(args.pipeline))

    # Options must be installed globally (setOptions) before the pipeline
    # module is imported below, since its stage definitions consult them.
    options = getOptions(args)
    setOptions(options)

    # Importing the pipeline module (named on the command line) defines
    # its ruffus stages as a side effect.
    __import__(drop_py_suffix(args.pipeline))

    # NOTE(review): logDir is read but never used in this function —
    # confirm whether the lookup is needed (e.g. to fail early on a
    # missing config key) before removing it.
    logDir = options.pipeline['logDir']
    startLogger()
    pipelineOptions = options.pipeline
    endTasks = pipelineOptions['end']
    forcedTasks = pipelineOptions['force']
    style = pipelineOptions['style']
    # Map the 'rebuild' setting onto ruffus' rebuild mode: only 'fromend'
    # disables maximal rebuild; anything else (including 'fromstart')
    # enables it.
    if pipelineOptions['rebuild'] == 'fromstart':
        rebuildMode = True
    elif pipelineOptions['rebuild'] == 'fromend':
        rebuildMode = False
    else:
        rebuildMode = True
    if style in ['run', 'touchfiles']:
        touchfiles_flag = (style == 'touchfiles')
        # Perform the pipeline steps (run the pipeline).
        pipeline_run(
            # End points of the pipeline.
            endTasks,
            # How many ruffus tasks to run.
            multiprocess=pipelineOptions['procs'],
            logger=black_hole_logger,
            # Force the pipeline to start from here, regardless of whether the
            # stage is up-to-date or not.
            forcedtorun_tasks=forcedTasks,
            # If the style was touchfiles, we will set a flag to bring
            # files up to date without running anything
            touch_files_only=touchfiles_flag,
            # Choose the mode in which ruffus decides how much work needs to be
            # done.
            gnu_make_maximal_rebuild_mode=rebuildMode)
    elif style == 'flowchart':
        # Draw the pipeline as a diagram.
        pipeline_printout_graph('flowchart.svg',
                                'svg',
                                endTasks,
                                no_key_legend=False)
    elif style == 'print':
        # Print a textual description of what the pipeline would do,
        # but don't actually run it.
        pipeline_printout(sys.stdout,
                          endTasks,
                          verbose=5,
                          wrap_width=100000,
                          forcedtorun_tasks=forcedTasks,
                          gnu_make_maximal_rebuild_mode=rebuildMode)
Example #7
0
Description:

Simple pipeline to demonstrate how to use the base tools.
Counts the number of lines in a set of files and then sums
them up.

'''

import sys
from ruffus import *
from utils import (runStageCheck, getOptions, initLog)
from cmdline_args import get_cmdline_args

# Parse the command line and load the configuration it points at.
args = get_cmdline_args()
options = getOptions(args)
# NOTE(review): logDir is not used in the visible code — confirm it is
# needed (e.g. by later stages) before removing.
logDir = options.pipeline['logDir']
# Start the logging process.
logger = initLog(options)

# The input files processed by this demo pipeline (hard-coded test data).
data_files = ['test_data/data1.txt', 'test_data/data2.txt']


# count the number of lines in a file
@transform(data_files, suffix('.txt'), ['.count', '.count.Success'])
def countLines(file, outputs):
    """Ruffus task: count the lines of *file* into a '.count' output.

    *outputs* is the pair [countFile, flagFile]; the '.Success' flag file
    presumably marks stage completion — runStageCheck's contract lives in
    the project's utils module.
    """
    output, flagFile = outputs
    runStageCheck('countLines', flagFile, logger, options, file, output)


# sum the counts from the previous stage
Example #8
0
File: rubra.py  Project: TBBSS/rubra
def main():
    """Entry point: configure and dispatch a rubra/ruffus pipeline.

    Reads the command-line arguments, imports the user's pipeline module
    so its stages are registered with ruffus, then performs one of the
    configured actions: run the pipeline (optionally only touching
    files), render a flowchart, or print a dry-run description.
    """
    args = get_cmdline_args()

    # Make imports resolve relative to the pipeline script's directory,
    # exactly as if that script had been executed directly.  This covers
    # the script itself and any config files pulled in by getOptions.
    sys.path.insert(0, os.path.dirname(args.pipeline))

    # The options must be installed globally before the pipeline module
    # is imported, since its stage definitions consult them.
    options = getOptions(args)
    setOptions(options)

    # Importing the pipeline module (named on the command line) defines
    # its ruffus stages as a side effect.
    __import__(drop_py_suffix(args.pipeline))

    logDir = options.pipeline['logDir']
    startLogger()

    pipelineOptions = options.pipeline
    endTasks = pipelineOptions['end']
    forcedTasks = pipelineOptions['force']
    style = pipelineOptions['style']

    # Only 'fromend' selects ruffus' non-maximal rebuild mode; any other
    # setting (including 'fromstart') selects the maximal one.
    rebuildMode = pipelineOptions['rebuild'] != 'fromend'

    if style in ['run', 'touchfiles']:
        # Execute the pipeline.  With 'touchfiles', ruffus only brings
        # output files up to date without running any stage commands.
        pipeline_run(
            endTasks,
            multiprocess=pipelineOptions['procs'],
            logger=black_hole_logger,
            # Stages forced to run regardless of up-to-dateness.
            forcedtorun_tasks=forcedTasks,
            touch_files_only=(style == 'touchfiles'),
            gnu_make_maximal_rebuild_mode=rebuildMode)
    elif style == 'flowchart':
        # Render the pipeline as an SVG diagram.
        pipeline_printout_graph(
            'flowchart.svg',
            'svg',
            endTasks,
            no_key_legend=False)
    elif style == 'print':
        # Describe what would be run, without actually running anything.
        pipeline_printout(
            sys.stdout,
            endTasks,
            verbose=5,
            wrap_width=100000,
            forcedtorun_tasks=forcedTasks,
            gnu_make_maximal_rebuild_mode=rebuildMode)