def flush(table, collect, name):
    # commit the changes
    if collect:
        table.append(collect)
        table.flush()
        # nicer information
        size = util.commify(len(table))
        logger.info("table=%s, contains %s rows" % (name, size))
Example #3
def predict(inpname, outname, options):
    """
    Generate the peak predictions on a genome wide scale
    """
    if options.strand == TWOSTRAND:
        logger.info('operating in twostrand mode')

    if options.index:
        index = hdflib.PositionalData(fname='', index=inpname, nobuild=True,
                                      workdir=options.workdir)
    else:
        index = hdflib.PositionalData(fname=inpname, nobuild=True,
                                      workdir=options.workdir)

    fp = file(outname, 'wt')

    for label in index.labels:
        table = index.table(label)
        size  = table.cols.idx[-1]
        info  = util.commify(size)
        logger.info('predicting on %s of total size %s' % (label, info))
        lo = 0
        hi = min( (size, options.maxsize) )

        while True:
            if lo >= size:
                break
            perc = '%.1f%%' % (100.0*lo/size)
            logger.info('processing %s %s:%s (%s)' % (label, lo, hi, perc))
            
            # get the data
            res = index.query(start=lo, end=hi, label=label)

            
            # exclusion zone
            w = options.exclude/2

            def predict(x, y):
                fx, fy = fitlib.gaussian_smoothing(x=x, y=y, sigma=options.sigma, epsilon=options.level )
                peaks = fitlib.detect_peaks(x=fx, y=fy )
                if options.mode != 'all':
                    peaks = fitlib.select_peaks(peaks=peaks, exclusion=options.exclude, threshold=options.level)
                return peaks

            if options.strand == TWOSTRAND:
                # operates in two strand mode
                for yval, strand in [ (res.fwd, '+'), (res.rev, '-') ]:
                    logger.debug('processing strand %s' % strand)
                    peaks = predict(x=res.idx, y=yval)
                    output(stream=fp, peaks=peaks, chrom=label, w=w, strand=strand)
            else:
                # combine strands
                peaks = predict(x=res.idx, y=res.val)
                output(stream=fp, peaks=peaks, chrom=label, w=w, strand='+')

            # switching to a higher interval
            lo = hi
            hi += options.maxsize
        
    fp.close()
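
For orientation, here is a minimal sketch of driving predict by hand. The
attribute names match the options used above, but the values and file names
are invented for illustration (the real script fills in options from its
command-line parser, which is not part of this example):

class _Options:                # hypothetical stand-in for parsed options
    strand  = 'single'         # anything other than TWOSTRAND combines strands
    index   = False            # inpname is a data file, not a prebuilt index
    workdir = None
    sigma, level  = 20, 1.0    # smoothing width and peak threshold
    exclude, mode = 100, 'all' # exclusion zone size; 'all' keeps every peak
    maxsize = 10 ** 7          # rows fetched per query window

predict(inpname='reads.txt', outname='peaks.txt', options=_Options())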
    def build(self):
        "May be overriden to use different parsers and schemas"

        logger.info("file='%s'" % self.fname)
        logger.info("index='%s'" % self.index)

        # check file for existence
        if missing(self.fname):
            raise IOError("missing data %s" % self.fname)

        # provides timing information
        timer = util.Timer()

        # iterate over the file
        reader = csv.reader(file(self.fname, "rt"), delimiter="\t")

        # unwind the reader until it hits the header
        for row in reader:
            if row[0] == "chrom":
                break

        # helper function that flushes a table
        def flush(table, collect, name):
            # commit the changes
            if collect:
                table.append(collect)
                table.flush()
                # nicer information
                size = util.commify(len(table))
                logger.info("table=%s, contains %s rows" % (name, size))

        # print a progress message every CHUNK lines
        last_chrom = table = None
        db = openFile(self.index, mode="w", title="HDF index database")

        # continue on with reading, optimized for throughput
        # with minimal function calls
        collect = []
        for linec, row in izip(count(1), reader):

            # prints processing progress and flushes collected rows periodically
            if (linec % CHUNK) == 0:
                logger.info("... processed %s lines" % util.commify(linec))
                flush(table=table, collect=collect, name=last_chrom)
                collect = []

            # get the values from each row
            chrom, index, fwd, rev, value = row
            fwd, rev, value = float(fwd), float(rev), float(value)

            # flush when switching chromosomes
            if chrom != last_chrom:
                # table==None at the beginning
                if table is not None:
                    # logger.debug("... flushing at line %s" % row)
                    flush(table=table, collect=collect, name=last_chrom)
                    collect = []

                # creates the new HDF table here
                table = db.createTable("/", chrom, PositionalSchema, "label %s" % chrom)
                logger.info("creating table:%s" % chrom)
                last_chrom = chrom

            collect.append((index, fwd, rev, value))

        # flush for last chromosome, report some timing information
        flush(table, collect, chrom)
        lineno = util.commify(linec)
        elapsed = timer.report()
        logger.info("finished inserting %s lines in %s" % (lineno, elapsed))

        # close database
        db.close()
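
The PositionalSchema used by createTable above is defined elsewhere in the
package. As a rough sketch only (not the project's actual definition), a
PyTables schema matching the (index, fwd, rev, value) tuples collected above
could look like this:

from tables import IsDescription, Int32Col, Float32Col

class PositionalSchema(IsDescription):
    idx = Int32Col(pos=1)    # position along the chromosome
    fwd = Float32Col(pos=2)  # forward strand value
    rev = Float32Col(pos=3)  # reverse strand value
    val = Float32Col(pos=4)  # combined value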
Example #6
from django import forms
from genetrack import logger, util

# needs a custom class to create a submit widget
class SubmitWidget(forms.widgets.Input):
    input_type = 'submit'

# custom widgets
ButtonWidget   = SubmitWidget(attrs={'class': 'nav_btn'})
FeatureWidget  = forms.TextInput(attrs={'size': '10', 'id': 'feature'})
ImageWidget    = forms.TextInput(attrs={'size': '4'})
FloatWidget    = forms.TextInput(attrs={'size': '1'})

# generate zoom levels with user friendly numbers
ZOOM_LEVELS  = "50 100 250 500 1000 2500 5000 10000 25000 50000 100000 250000 500000 1000000".split()
ZOOM_CHOICES = map(lambda x: (x, util.commify(x) ), ZOOM_LEVELS)

def zoom_change(value, step):
    """
    Gets the next zoom level, either up or down

    >>> levels = [(100, 1), (100, -1), (1000000, 1)]
    >>> it = starmap( zoom_change, levels )
    >>> list(it)
    [250, 50, 1000000]
    """
    global ZOOM_LEVELS
    try:
        index = ZOOM_LEVELS.index( str(value) )
        index += step
        index = index if index > 0 else 0
        # clamp at the largest preset; stepping past the end stays at the
        # maximum, matching the (1000000, 1) case in the docstring
        index = min(index, len(ZOOM_LEVELS) - 1)
        return int(ZOOM_LEVELS[index])
    except ValueError:
        # the value is not one of the presets; return it unchanged
        return value
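
A hedged sketch of how these widgets and choices might be wired into a form;
the form and field names below are illustrative rather than taken from the
project:

class NavigationForm(forms.Form):
    feature = forms.CharField(widget=FeatureWidget)
    zoom    = forms.ChoiceField(choices=ZOOM_CHOICES)
    go      = forms.CharField(widget=ButtonWidget, required=False)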
Example #7
        if strand == '+':
            # on forward strand, 5' is at start
            idx = int(start) + shift
            fwd, rev, val = 1, 0, 1
        elif strand == '-':
            # on reverse strand, 5' is at end
            idx = int(end) - shift
            fwd, rev, val = 0, 1, 1
        else:
            # no strand specified, generate interval centers
            idx = (int(start)+int(end))/2
            fwd, rev, val = 0, 0, 1

        # it is essential to be able to sort the index as a string!
        fp.write('%s\t%012d\t%s\t%s\t%s\n' % (chrom, idx, fwd, rev, val))

    fp.close()
    linet = util.commify(linec)
    logger.debug("parsing %s lines finished in %s" % (linet, timer.report()))

    # if it is producing coverage then it will expand reads into full intervals

    # now let the sorting commence
    cmd = "sort %s > %s" % (flat, sorted)
    logger.debug("sorting into '%s'" % sorted)
    os.system(cmd)
    logger.debug("sorting finished in %s" % timer.report() )

    logger.debug("consolidating into '%s'" % outname)
    consolidate( sorted, outname, format=format)
    logger.debug("consolidate finished in %s" % timer.report() )
    logger.debug("output saved to '%s'" % outname)
    logger.debug("full conversion finished in %s" % full.report() )
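
An aside on the '%012d' padding used when writing the flat file: zero-padding
to a fixed width is what makes the textual `sort` command above agree with
numeric order on the index column. A quick sanity check:

values = [9, 10, 1000]
padded = ['%012d' % v for v in values]
assert sorted(padded) == ['%012d' % v for v in sorted(values)]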