def output(stream, peaks, chrom, w=73, strand='+'):
    "Outputs peaks to a stream"
    logger.debug('writing %s peaks on strand %s' % (commify(len(peaks)), strand))
    for mid, value in peaks:
        start, end = mid - w, mid + w
        stream.write("%s\t%d\t%d\t.\t%f\t%s\n" % (chrom, start, end, value, strand))
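# Usage sketch for output() (values hypothetical; 'logger' and 'commify' are
# assumed to be available at module level):
#
#     import sys
#     peaks = [(1000, 7.5), (2500, 3.2)]   # (midpoint, score) pairs
#     output(stream=sys.stdout, peaks=peaks, chrom='chr1', w=73, strand='+')
#
# which writes tab separated lines such as:
#
#     chr1  927  1073  .  7.500000  +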
def transform(inpname, size, outname=None):
    """
    Transforms reads stored in an Eland export file to a GFF file,
    extending each read to a fixed length.
    """
    logger.debug('input %s' % inpname)
    logger.debug('output %s' % outname)

    reader = csv.reader(open(inpname), delimiter='\t')

    # skip the leading comment lines; dropwhile leaves the first data row
    # in the reader (assumes: from itertools import dropwhile)
    reader = dropwhile(lambda x: x[0].startswith('#'), reader)

    output = file(outname, 'wt')
    output.write('##gff-version 3\n')
    output.write('# created with eland2gff on %s\n' % inpname)
    output.write('# fixed read length of %s\n' % size)

    for row in reader:
        chrom, start, strand = row[10], int(row[12]), row[13]
        end = start + size
        if strand == 'F':
            strand = '+'
        else:
            strand = '-'
        result = map(str, [chrom, '.', '.', start, end, '.', strand, '.', '.'])
        output.write("%s\n" % '\t'.join(result))

    output.close()
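# Sketch of the transformation above (row values hypothetical): an Eland
# export row whose columns 10, 12 and 13 hold chromosome, position and
# strand, e.g.
#
#     ...  chr2  ...  5000  F  ...
#
# becomes, with size=36, the GFF line:
#
#     chr2  .  .  5000  5036  .  +  .  .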
def predict(inpname, outname, options):
    """
    Generates peak predictions on a genome wide scale
    """
    if options.strand == TWOSTRAND:
        logger.info('operating in twostrand mode')

    if options.index:
        index = hdflib.PositionalData(fname='', index=inpname, nobuild=True,
                                      workdir=options.workdir)
    else:
        index = hdflib.PositionalData(fname=inpname, nobuild=True,
                                      workdir=options.workdir)

    fp = file(outname, 'wt')
    for label in index.labels:
        table = index.table(label)
        size = table.cols.idx[-1]
        info = util.commify(size)
        logger.info('predicting on %s of total size %s' % (label, info))
        lo = 0
        hi = min((size, options.maxsize))

        while True:
            if lo >= size:
                break
            perc = '%.1f%%' % (100.0 * lo / size)
            logger.info('processing %s %s:%s (%s)' % (label, lo, hi, perc))

            # get the data
            res = index.query(start=lo, end=hi, label=label)

            # half width of the exclusion zone
            w = options.exclude / 2

            # local helper; note that it shadows the enclosing predict()
            def predict(x, y):
                fx, fy = fitlib.gaussian_smoothing(x=x, y=y, sigma=options.sigma,
                                                   epsilon=options.level)
                peaks = fitlib.detect_peaks(x=fx, y=fy)
                if options.mode != 'all':
                    peaks = fitlib.select_peaks(peaks=peaks, exclusion=options.exclude,
                                                threshold=options.level)
                return peaks

            if options.strand == TWOSTRAND:
                # operates in two strand mode
                for yval, strand in [(res.fwd, '+'), (res.rev, '-')]:
                    logger.debug('processing strand %s' % strand)
                    peaks = predict(x=res.idx, y=yval)
                    output(stream=fp, peaks=peaks, chrom=label, w=w, strand=strand)
            else:
                # combine strands
                peaks = predict(x=res.idx, y=res.val)
                output(stream=fp, peaks=peaks, chrom=label, w=w, strand='+')

            # move on to the next interval
            lo = hi
            hi += options.maxsize

    fp.close()
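# Invocation sketch for predict() (all option values hypothetical; the
# attribute names mirror those read above):
#
#     class Options(object):
#         strand, index, workdir = 'ALL', False, None
#         sigma, level, exclude = 20, 1.0, 147
#         mode, maxsize = 'maximal', 10**7
#
#     predict(inpname='reads.txt', outname='peaks.txt', options=Options())
#
# Each chromosome is scanned in windows of at most options.maxsize indices,
# so memory use stays bounded regardless of chromosome size.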
def fix_site_settings():
    """
    On first run the site information may not be correct.
    This function updates the site to the correct values.
    """
    site, flag = Site.objects.get_or_create(id=settings.SITE_ID)
    if site.domain != settings.SITE_DOMAIN:
        site.domain, site.name = settings.SITE_DOMAIN, settings.SITE_NAME
        logger.debug('modifying site domain:%s name:%s' % (site.domain, site.name))
        site.save()
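# fix_site_settings() expects the Django settings module to define the site
# values it copies over, for example (values hypothetical):
#
#     SITE_ID = 1
#     SITE_DOMAIN = 'genetrack.example.org'
#     SITE_NAME = 'GeneTrack'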
def get_project(user, pid, write=True):
    "Returns a project for a given user"
    try:
        member = models.Member.objects.get(user=user, project__id=pid)
        project = member.project
        project.role = member.role
        project.is_manager = (project.role == status.MANAGER)
    except ObjectDoesNotExist, exc:
        logger.debug(exc)
        raise AccessError("You may not access this project")

    # write access check on by default
    if write and not project.is_manager:
        logger.debug('write access with invalid role')
        raise AccessError('You may not change this project')

    return member.project
def get_result(user, rid):
    "Returns a result for a given user"
    try:
        # get the result
        result = models.Result.objects.get(id=rid)
        # verifies access rights via the underlying data
        data = get_data(user, did=result.data.id, write=False)
    except ObjectDoesNotExist, exc:
        logger.debug(exc)
        raise AccessError("You may not access this result")

    return result
def get_data(user, did, write=True):
    "Returns a data object for a given user"
    try:
        data = models.Data.objects.get(id=did)
        member = models.Member.objects.get(user=user, project=data.project)
        project = member.project
        project.role = member.role
        project.is_manager = (project.role == status.MANAGER)
        data.write_access = project.is_manager or (data.owner == user)
    except ObjectDoesNotExist, exc:
        logger.debug(exc)
        raise AccessError("You may not access this project")

    # write access check on by default
    if write and not data.write_access:
        logger.debug('write access with invalid role')
        raise AccessError('You may not change this data')

    return data
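# Typical use of the accessors above from a view (sketch; the view name and
# the redirect target are hypothetical):
#
#     def data_view(request, did):
#         try:
#             data = get_data(user=request.user, did=did, write=False)
#         except AccessError:
#             return html.redirect('/login/')
#         ...
#
# The write=True default makes accidental modification paths fail loudly;
# views that only display content must pass write=False explicitly.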
def load_users(fname, options):
    "Loads users into the database"
    if not os.path.isfile(fname):
        logger.error('file not found %s' % fname)
        return

    if options.flush:
        # resets the database with fresh values
        flush_database()

    # alters site settings
    fix_site_settings()

    # this will be the admin password
    passwd = settings.SECRET_KEY

    # check the length of the secret key in non debug modes
    if not settings.DEBUG and len(passwd) < 5:
        msg = 'The value of the SECRET_KEY setting is too short. Please make it longer!'
        logger.error(msg)
        sys.exit()

    # shortcut to user creation
    user_get = User.objects.get_or_create

    # read the file and create the users based on the data in it
    stream = file(fname)
    for row in csv.DictReader(stream):
        username, email = row['username'], row['email']
        first_name, last_name = row['first_name'], row['last_name']
        is_superuser = (row['is_superuser'] == 'yes')
        user, flag = user_get(username=username, email=email,
                              first_name=first_name, last_name=last_name,
                              is_superuser=is_superuser, is_staff=is_superuser)
        if flag:
            logger.debug('created user: %s' % user.get_full_name())
            if username in ('admin', 'demo', 'public'):
                # these accounts get the secret key as password
                user.set_password(passwd)
            else:
                if options.test_mode:
                    # in test mode we set known passwords,
                    # used during functional testing
                    user.set_password(passwd + 'X')
                else:
                    # all other users will need to reset their passwords
                    user.set_unusable_password()
            user.save()
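# Input layout expected by load_users(): a CSV file whose header names the
# columns read above (rows hypothetical):
#
#     username,email,first_name,last_name,is_superuser
#     admin,admin@example.org,Site,Admin,yes
#     demo,demo@example.org,Demo,User,no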
def upload_processor(request, pid):
    "Handles the actual data upload"
    user = request.user

    if user.is_authenticated() and user.username != 'public':
        if 'upload' in request.POST:
            count = 0
            # take at most 50 files
            for i in range(50):
                key = 'File%s' % i
                if key in request.FILES:
                    count += 1
                    stream = request.FILES[key]
                    name = html.chop_dirname(stream.name)
                    logger.debug('%s uploaded file %s' % (user.username, name))
                    authorize.create_data(user=user, pid=pid, stream=stream,
                                          name=name, info='no information')
            user.message_set.create(message="Uploaded %s files" % count)

    if 'simple' in request.POST:
        return html.redirect("/project/view/%s/" % pid)

    # this is needed only because the JUPload applet makes a HEAD request
    return html.response('SUCCESS\n')
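# Request layout assumed by upload_processor() (sketch): files are posted
# under the keys File0 .. File49 together with an 'upload' marker field; a
# plain HTML form may add 'simple' to get redirected back to the project
# page instead of the bare SUCCESS response. The URL below is hypothetical,
# only the field names follow from the code:
#
#     POST /project/upload/<pid>/
#         upload=1  simple=1
#         File0=<first file>  File1=<second file>  ...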
def __init__(self, fname, workdir=None, update=False, nobuild=False, index=None):
    """
    Creates the PositionalData
    """
    self.fname = fname
    self.db = None

    # split the incoming name to find the real name and base directory
    basedir, basename = os.path.split(self.fname)

    # the index may be stored in the workdir if it was specified
    basedir = workdir or basedir

    # this is the HDF index name that the file operates on
    self.index = index or conf.path_join(basedir, "%s.hdf" % basename)

    # debug messages
    logger.debug("file path %s" % self.fname)
    logger.debug("index path %s" % self.index)

    # no building permitted
    if nobuild and missing(self.index):
        raise Exception("No autobuild allowed and no index found at %s" % self.index)

    # create the index if it is missing or an update is forced
    if update or missing(self.index):
        self.build()

    # operates on the HDF file
    self.db = openFile(self.index, mode="r")
    self.root = self.db.root

    # shows the internal labels
    logger.debug("index labels -> %s" % self.labels)
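# Usage sketch (file names hypothetical; the labels/query pattern follows the
# predict() function in this codebase):
#
#     index = PositionalData(fname='reads.txt', workdir='/tmp')
#     for label in index.labels:                    # one label per chromosome
#         res = index.query(start=0, end=10**6, label=label)
#         print label, len(res.idx)
#     index.db.close()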
def transform(inpname, outname, format, shift=0, index=False, options=None):
    """
    Transforms reads stored in a bedfile to a genetrack input file.
    Requires at least 6 bed columns to access the strand.
    """
    # detect file formats
    if format == "BED":
        CHROM, START, END, STRAND = 0, 1, 2, 5
    elif format == "GFF":
        CHROM, START, END, STRAND = 0, 3, 4, 6
    else:
        raise Exception('Invalid file format %s' % format)

    # two sanity checks, one day someone will thank me
    if format == 'BED' and inpname.endswith('gff'):
        raise Exception('BED format on a gff file?')
    if format == 'GFF' and inpname.endswith('bed'):
        raise Exception('GFF format on a bed file?')

    # find the basename of the output name
    basename = os.path.basename(outname)

    # two files store intermediate results
    flat = conf.tempdata('%s.flat' % basename)
    sorted = conf.tempdata('%s.sorted' % basename)

    # check for track information on the first line,
    # much faster this way than conditional checking on each line
    fp = file(inpname, 'rU')
    first = fp.readline()
    fp.close()

    # create the reader
    reader = csv.reader(file(inpname, 'rU'), delimiter='\t')

    # skip the track line if it exists
    if first.startswith('track'):
        reader.next()

    # skip the leading comments; dropwhile leaves the first data row
    # in the reader (assumes: from itertools import dropwhile)
    reader = dropwhile(lambda x: x[0].startswith('#'), reader)

    # copious timing info for those who enjoy these
    timer, full = util.Timer(), util.Timer()
    logger.debug("parsing '%s'" % inpname)
    logger.debug("output to '%s'" % outname)

    # create the unsorted output file and apply corrections
    logger.debug("unsorted flat file '%s'" % flat)

    fp = file(flat, 'wt')
    for linec, row in enumerate(reader):
        try:
            chrom, start, end, strand = row[CHROM], row[START], row[END], row[STRAND]
        except Exception, exc:
            first = row[0][0]
            # may be hitting the end of the file with other comments
            if first == '>':
                # hit the sequence content of the gff file
                break
            elif first == '#':
                # hit upon some comments
                continue
            else:
                logger.error(row)
                raise Exception(exc)

        if strand == '+':
            # on the forward strand, 5' is at start
            idx = int(start) + shift
            fwd, rev, val = 1, 0, 1
        elif strand == '-':
            # on the reverse strand, 5' is at end
            idx = int(end) - shift
            fwd, rev, val = 0, 1, 1
        else:
            # no strand specified, generate interval centers
            idx = (int(start) + int(end)) / 2
            fwd, rev, val = 0, 0, 1

        # it is essential to be able to sort the index as a string!
        fp.write('%s\t%012d\t%s\t%s\t%s\n' % (chrom, idx, fwd, rev, val))

    fp.close()

    linet = util.commify(linec)
    logger.debug("parsing %s lines finished in %s" % (linet, timer.report()))

    # now let the sorting commence
    cmd = "sort %s > %s" % (flat, sorted)
    logger.debug("sorting into '%s'" % sorted)
    os.system(cmd)
    logger.debug("sorting finished in %s" % timer.report())

    # if it is producing coverage then it will expand reads into full intervals
    logger.debug("consolidating into '%s'" % outname)
    consolidate(sorted, outname, format=format)
    logger.debug("consolidate finished in %s" % timer.report())

    logger.debug("output saved to '%s'" % outname)
    logger.debug("full conversion finished in %s" % full.report())
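# Sketch of the flat file encoding above (row values hypothetical): the
# tab separated BED row
#
#     chr1  100  200  read1  0  +
#
# becomes, with shift=0, the line
#
#     chr1  000000000100  1  0  1
#
# The zero padded '%012d' index makes the textual sort produced by the
# external 'sort' command agree with numeric coordinate order.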
help="how many jobs to run in parallel" ) # flushes all content away, drops all database content! parser.add_option( '--server', action="store_true", dest="server", default=False, help="runs as a server and invokes the jobrunner at every delay seconds", ) # parse the argument list options, args = parser.parse_args() logger.disable(options.verbosity) # missing file names if options.server and not options.delay: parser.print_help() else: if options.server: logger.info('server mode, delay=%ss' % options.delay) while 1: # this is used to start multiple jobs with cron (at every minute but # having them actually start up at smaller increments time.sleep(options.delay) execute(limit=options.limit) if not options.server: break else: logger.debug( 'jobserver waiting %ss' % options.delay)