def aln_from_bedfile(self, bed):
    '''Given a bed file name, retrieve the associated database
    Alignment and Lane objects.

    The library code, facility and lane number are parsed out of the
    file name and must match exactly one Lane in the repository.

    Arguments:
      bed -- name of the bed file; it must contain self.genome
             (searched case-insensitively as a regular expression).

    Returns a tuple (aln, lane), where aln is whatever
    self._create_alignment(bed, lane) returns.

    Raises ValueError if the genome code is absent from the file
    name, or if zero or several lanes match the file name.
    '''
    LOGGER.info("Processing bed file: %s", bed)

    # A quick sanity check to try and make sure we don't load a file
    # against the wrong genome. This remains fallible since some
    # genome codes are quite short and might occur in a filename by
    # chance.
    # Note do not merge this re.I change into repackaging branch (it
    # is unnecessary).
    if not re.search(self.genome, bed, re.I):
        raise ValueError("Genome code not found in bed file name."
                         + " Loading against the wrong genome (%s)?"
                         % self.genome)

    (code, facility, lanenum, _pipeline) = parse_repository_filename(bed)
    lanelist = Lane.objects.filter(library__code=code,
                                   facility__code=facility,
                                   lanenum=lanenum)

    # Count once; every count() call is a separate database query.
    nlanes = lanelist.count()
    if nlanes == 0:
        raise ValueError("Could not find lane for '%s'" % (bed))
    if nlanes > 1:
        # Lane ids need not be strings, and str.join() rejects
        # non-string items, so convert them before joining.
        raise ValueError(("Found multiple lanes for '%s': " % (bed,))
                         + ", ".join([str(x.id) for x in lanelist]))

    lane = lanelist[0]
    aln = self._create_alignment(bed, lane)
    return (aln, lane)
def setLaneStatusByFilename(self, fname, status): '''Changes status of the lane matching the library/facility/lanenum info extracted from the file name.''' # 1. Get lane_id matching library/facility/lanenum (library, facility, lanenum, pipeline) = parse_repository_filename(fname) # 2. Find lane and create one if not found try: lib = Library.objects.search_by_name(library) except Library.DoesNotExist, _err: raise SystemExit("No library %s in repository." % library)
class RepoFileHandler(object):
    '''Class which is almost certainly overkill given the limited
    functionality left in this script, post-refactor.'''

    @staticmethod
    def run(fns, md5files=False, archive=None, md5sums=None):
        '''Main entry point for the class.

        Creates and saves one Lanefile record per file name.

        Arguments:
          fns      -- iterable of file names to register.
          md5files -- if true (and md5sums is not given), try
                      checksum_from_file() before computing the
                      checksum with checksum_file().
          archive  -- optional ArchiveLocation name; when given, each
                      Lanefile is linked to that archive with today's
                      date as its archive date.
          md5sums  -- optional sequence of checksums, one per entry of
                      fns and in the same order; when given these are
                      used as-is.

        Exits via SystemExit if no ArchiveLocation is named `archive`.
        '''
        arc = None
        arc_date = None
        if archive is not None:
            try:
                arc = ArchiveLocation.objects.get(name=archive)
            except ArchiveLocation.DoesNotExist:
                raise SystemExit("No ArchiveLocation with name '%s'" % archive)
            arc_date = datetime.date.today()

        for i, fname in enumerate(fns):

            # We assume that the file names in the list may correspond
            # to different lanes. Hence, we search lane for each file
            # again.
            lane = get_lane_for_file(fname)

            # Even though fname was already parsed in
            # get_lane_for_file, parse it again as we need the
            # pipeline value.
            (_code, _facility, _lanenum, pipeline) = \
                parse_repository_filename(fname)

            # Reset for every file: otherwise, with neither md5sums
            # nor md5files set, the name would be unbound for the
            # first file and hold the previous file's checksum for
            # the later ones.
            chksum = None
            if md5sums is not None:
                # md5sums have been provided.
                chksum = md5sums[i]
            else:
                # Try the .md5 file on the file location if requested.
                if md5files:
                    chksum = checksum_from_file(fname)
                # As a last resort, try to compute md5sum.
                if chksum is None:
                    chksum = checksum_file(fname)

            filetype = Filetype.objects.guess_type(fname)

            # Record the file under its base name, without any
            # trailing '.gz' extension.
            basefn = os.path.split(fname)[1]
            fnparts = os.path.splitext(basefn)
            if fnparts[1] == '.gz':
                basefn = fnparts[0]
            LOGGER.debug("basefn: '%s'", basefn)

            lanefile = Lanefile(filename=basefn, checksum=chksum,
                                filetype=filetype, lane=lane,
                                pipeline=pipeline, archive=arc,
                                archive_date=arc_date)
            lanefile.save()
            LOGGER.info("Added %s to repository.", basefn)
def check_bam_vs_lane_fastq(self, bam, relaxed=False):
    '''
    Quick check that the number of reads in the bam file being saved
    is identical to the number of (passed PF) reads in the input fastq
    file. Returns the number of reads in the bam file.

    The lane is identified from the library code, facility and lane
    number parsed out of the bam file name. If no such lane exists in
    the repository the error is logged and the process exits.
    '''
    (code, facility, lanenum, _pipeline) = parse_repository_filename(bam)
    try:
        lane = Lane.objects.get(library__code=code,
                                facility__code=facility,
                                lanenum=lanenum)
    except Lane.DoesNotExist, err:
        LOGGER.error( "Unexpected lane in filename, not found in repository.")
        # sys.exit() with a string argument prints it to stderr and
        # exits with status 1.
        sys.exit("Unable to find lane in repository")
    # NOTE(review): the read-count comparison, the use of `relaxed`
    # and the return value described in the docstring are not present
    # in the section visible here -- confirm against the remainder of
    # the method.
def append(fname, library=None, facility=None, lanenum=None, genome=None):
    '''
    Given a filename, figure out where it belongs and load it into the
    repository. Additional hints may be provided.

    Either give fname alone, in which case library, facility and
    lanenum are parsed from it, or give all three of library,
    facility and lanenum explicitly.

    Raises ValueError if only some of library/facility/lanenum are
    given, and StandardError if no Library has the resulting code.
    '''
    LOGGER.info("Processing %s", fname)

    # One flag per hint: True where the caller supplied a value.
    argcheck = [ x is not None for x in (library, facility, lanenum) ]
    if any(argcheck):
        # Hints are all-or-nothing; a partial set is ambiguous.
        if not all(argcheck):
            raise ValueError("Either use filename on its own, or all of"
                             + " the following: library, facility, lanenum.")
    else:
        # No hints at all: derive everything from the file name.
        (library, facility, lanenum, _pipeline) = parse_repository_filename(fname)

    # Swap the library code for the Library object. On failure
    # `library` still holds the code, so the message reports it.
    try:
        library = Library.objects.get(code=library)
    except Library.DoesNotExist, err:
        raise StandardError("No library found with code %s" % (library,))
    # NOTE(review): `genome`, `facility` and `lanenum` are not used in
    # the section visible here, and nothing is loaded yet -- confirm
    # against the remainder of the function.
def get_lane_for_file(fname):
    '''Return the single repository Lane matching a file name.

    The library code, facility and lane number are parsed from fname
    with parse_repository_filename() and used to query Lane.

    Arguments:
      fname -- file name identifying the lane.

    Returns the matching Lane object. If no lane, or more than one
    lane, matches, the problem is logged and the process exits with
    status 1.
    '''
    (code, facility, lanenum, _pipeline) = parse_repository_filename(fname)
    lanelist = Lane.objects.filter(library__code=code,
                                   lanenum=lanenum,
                                   facility__code=facility)

    lane = None
    nlanes = len(lanelist)
    if nlanes == 0:
        LOGGER.error("Could not find lane for '%s'", fname)
    elif nlanes > 1:
        # Lane ids need not be strings, and str.join() rejects
        # non-string items; convert them so that this error path
        # reports the duplicates instead of raising TypeError.
        LOGGER.error("Found multiple lanes for '%s': %s",
                     fname, ", ".join([str(x.id) for x in lanelist]))
    else:
        lane = lanelist[0]

    if lane is None:
        LOGGER.error("Lane not determined! Exiting.")
        sys.exit(1)

    return lane
def find_lanes(self):
    '''Find list of lane(s) associated with library, lane number and facility

    Stores the matching Lane queryset in self.lanes. When
    self.merged_file, self.lane or self.library is set, lanes are
    matched on library code alone; otherwise lane number and facility
    parsed from self.fn_base are matched as well.

    Logs an error and exits with status 1 if no library code can be
    determined or if no lane matches.
    '''
    if not self.library:
        (code, facility, lanenum, pipeline) = parse_repository_filename(self.fn_base)
        # Merged files have only code in their prefix meaning failure by parse_repository_filename() above.
        if self.merged_file or self.lane:
            # Fall back to the text before the first underscore.
            code = self.fn_base.split('_')[0]
    else:
        # NOTE(review): `fn` is not defined anywhere in this method;
        # unless it is a module-level name this raises NameError
        # whenever self.library is set. Presumably self.library was
        # intended -- confirm.
        code = fn
    if code is None or code == '':
        LOGGER.error("Unable to extract code from filename %s." % self.fn_base)
        sys.exit(1)
    if self.merged_file or self.lane or self.library:
        self.lanes = Lane.objects.filter(library__code=code)
    else:
        self.lanes = Lane.objects.filter(library__code=code, lanenum=lanenum, facility__code=facility)
    if len(self.lanes) == 0:
        LOGGER.error("No lane associated with code \'%s\'." % code)
        sys.exit(1)
django.setup()

from osqpipe.models import Lane, Lanefile
from osqpipe.pipeline.laneqc import LaneFastQCReport
from osqutil.utilities import parse_repository_filename
from osqutil.config import Config

CONFIG = Config()

# Scan the directories two levels below the current working
# directory. Each second-level directory name is parsed as a
# repository file name to identify the lane it belongs to.
for d in os.listdir('.'):
    if os.path.isdir(d):
        for r in os.listdir(d):
            if os.path.isdir(os.path.join(d, r)):
                (lib, fac, lanenum, _pipeline) = parse_repository_filename(r)
                if lib is None:
                    # Interpolate the directory name; previously the
                    # bare format string was printed with a literal %r.
                    print("Cannot parse dir name: %r. Skipping." % (r,))
                    continue

                # Human-readable lane label, used only in messages.
                laneid = "%s_%s%02d" % (lib, fac, lanenum)
                try:
                    lane = Lane.objects.get(library__code=lib,
                                            facility__code=fac,
                                            lanenum=lanenum)
                except Lane.DoesNotExist:
                    print("Lane not found in DB: %s. Skipping." % laneid)
                    continue

                if lane.laneqc_set.count() > 0:
                    print("Lane already has QC: %s. Skipping." % laneid)