def make_bed_graph(self, aln):
    '''Code wrapper for makeWiggle.'''
    aln = Alignment.objects.get(id=aln.id)  # Reload passed object within transaction.
    bed = aln.alnfile_set.filter(filetype=self.bedtype).exclude(
        filename__contains='chr21')[0]

    # Note that makeWiggle can read gzipped bed files directly; we use
    # that fact here.
    lib = aln.lane.library
    bedFN = bed.repository_file_path

    # Write to local directory first.
    bgrBASE = os.path.splitext(bed.filename)[0]
    bgrFN = bgrBASE + self.bgrtype.suffix
    cmd = BED2BGR % (quote(bedFN), quote(bgrBASE))
    LOGGER.debug(cmd)
    if not self.testMode:
        call_subprocess(cmd, shell=True, path=self.conf.hostpath)
        if not os.path.exists(bgrFN):
            LOGGER.error("Failed to create bgr file '%s'" % (bgrFN,))
        else:
            chksum = checksum_file(bgrFN)
            bgr = Alnfile(filename=os.path.basename(bgrFN),
                          checksum=chksum,
                          filetype=self.bgrtype,
                          alignment=aln)
            bgrFN = rezip_file(bgrFN)
            move(bgrFN, bgr.repository_file_path)
            set_file_permissions(self.conf.group, bgr.repository_file_path)
            bgr.save()
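# Usage sketch (hedged): BED2BGR is assumed to be a two-slot shell command
# template defined elsewhere in this module; the alignment id below is
# purely illustrative.
#
#   aln = Alignment.objects.get(id=42)
#   self.make_bed_graph(aln)
#   # -> writes the .bgr file locally, gzips it, moves it into the
#   #    repository and registers an Alnfile row pointing at it.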
def _check_file_zipped(self, fname, fobj):
    # Logging currently handled by the utilities module.
    zipped = is_zipped(fname)
    if fobj.filetype.gzip and not zipped:
        fname = rezip_file(fname, overwrite=True)
    elif not fobj.filetype.gzip and zipped:
        fname = unzip_file(fname, overwrite=True)
    return fname
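# Behaviour sketch for _check_file_zipped (filenames hypothetical):
#
#   filetype.gzip=True,  'reads.fq'    -> rezip_file -> 'reads.fq.gz'
#   filetype.gzip=False, 'reads.fq.gz' -> unzip_file -> 'reads.fq'
#   otherwise the filename is returned unchanged.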
def run_qc(fnames, workdir, destination=None, cleanup=True, register=False):
    with LaneFastQCReport(fastqs=fnames, workdir=workdir, lane=0) as qc:

        # Generate QC reports.
        qc.run_fastqc(qc.fastqs)
        qc.postprocess_results(qc.fastqs)

        # Create the list of disk files, compressing some of them first
        # if needed.
        dfiles = []
        # NB! This is not elegant; a better test would be
        # "if ftype.gzip and os.path.splitext(fname)[1] != CONFIG.gzsuffix:".
        # However, this code is set up not to interact directly with the
        # database.
        for fn in qc.output_files:
            if fn.endswith('txt') or fn.endswith('tar'):
                dfn = rezip_file(fn)
                dfiles.append(dfn)
            else:
                dfiles.append(fn)

        if destination is not None:
            # Transfer files to destination.
            for dfn in dfiles:
                set_file_permissions(CONFIG.group, dfn)
                transfer_file(dfn, destination)

        if register:
            # Register QC files in the repository.
            argslist = []
            for (fn, md5) in zip(qc.output_files, qc.output_md5s):
                argslist.append(os.path.basename(fn))
                argslist.append(md5)
            cmd = "cs_addFile.py --qcfile -M --program_name %s " % qc.program_name
            cmd += " ".join(argslist)
            print "Executing \"%s\" ..." % cmd
            subproc = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
            (stdout, stderr) = subproc.communicate()
            retcode = subproc.wait()
            if stdout:
                sys.stdout.write(stdout)
            if stderr:
                sys.stderr.write(stderr)

        if cleanup:
            # Remove local files. Assuming the fastqc report dir is still
            # around, construct its name from the PDF filename.
            # NB! A cleaner way would be to save the dir name to self.bpath
            # in postprocess_results in the LaneQCReport class and use that
            # value. Better still, LaneFastQCReport could keep track of all
            # the temporary files it creates.
            for dfn in dfiles:
                os.remove(dfn)
                if dfn.endswith('pdf'):
                    fqc_dirname = os.path.splitext(dfn)[0]
                    rmtree(fqc_dirname)
                    zipfile = fqc_dirname + '.zip'
                    os.remove(zipfile)
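# Minimal usage sketch for run_qc (filenames, workdir and destination are
# hypothetical):
#
#   run_qc(['12345_lane1.fq.gz', '12345_lane2.fq.gz'],
#          workdir='/tmp/qcwork',
#          destination='qcuser@datahost:/data/qc',
#          register=False)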
def add(self, files, final_status=None):
    '''
    Process a list of filenames (files must exist on disk). The
    optional final_status argument specifies a models.Status object
    to which the lane should be linked upon completion.
    '''
    # We need at least one bed file.
    bed = self.identify_bed_file(files)
    if not bed:
        raise ValueError("Unable to identify any bed files in the input.")

    # Find the appropriate alignment. Note that aln is not yet saved
    # in the database.
    (aln, lane) = self.aln_from_bedfile(bed)

    # Do some heavy lifting *outside* of our database transaction, to
    # avoid locking the db for extended periods.
    chksums = dict()
    processed = []
    for fname in files:

        # If the file is uncompressed, don't waste time zipping it
        # prior to checksum.
        chksums[fname] = checksum_file(fname)  # Also works on zipped files.

        ftype = Filetype.objects.guess_type(fname)
        if ftype is None:
            raise ValueError("File type not recognised from database: %s" % fname)
        if ftype.gzip and not is_zipped(fname):
            fname = rezip_file(fname)
        processed.append(fname)

    # All database changes should be handled by the
    # transaction-embedded method below.
    self._save_to_repository(processed, chksums, aln, final_status)

    return aln
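# Minimal usage sketch for add() ('handler' stands in for whatever instance
# owns this method; filenames and the status code are hypothetical, and the
# files must already exist on disk):
#
#   status = Status.objects.get(code='complete')
#   aln = handler.add(['sample.bed', 'sample.bam'], final_status=status)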
def add_bam_to_lane(self, bam, lane, tc1=False, chrom_sizes=None):
    '''
    Generate a bed file from a bam file and add both to the given
    lane. This method is typically used from within an ipython shell
    to handle unusual cases outside the main pipeline. Note that
    genome and data provenance info is passed in via the class
    attributes prog and params.
    '''
    bam_to_bed = BamToBedConverter(tc1=tc1, chrom_sizes=chrom_sizes)
    base = os.path.splitext(bam)[0]
    bedtype = Filetype.objects.get(code='bed')
    bed_fn = base + bedtype.suffix
    beds = bam_to_bed.convert(bam, bed_fn)
    chksums = dict((fname, checksum_file(fname)) for fname in [bam] + beds)

    # First bed file is the main one.
    aln = self._create_alignment(beds[0], lane)
    if bedtype.gzip:
        bedgz = [rezip_file(bed) for bed in beds]
    else:
        bedgz = beds  # Avoid a NameError when the filetype is not gzipped.
    self._save_to_repository([bam] + bedgz, chksums, aln)
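# Typical interactive use from an ipython shell, per the docstring above
# (a sketch; the lane lookup and filenames are illustrative only):
#
#   lane = Lane.objects.get(id=1234)
#   self.add_bam_to_lane('sample.bam', lane, tc1=False,
#                        chrom_sizes='mm10.chrom.sizes')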
# Set group ownership and permissions appropriately.
grp = self.conf.group
set_file_permissions(grp, in_fn)
for bed in beds:
    set_file_permissions(grp, bed)
for wig in wigs:
    set_file_permissions(grp, wig)
for bgr in bedgraphs:
    set_file_permissions(grp, bgr)
for bwig in bigwigs:
    set_file_permissions(grp, bwig)

# Compress bed file(s).
bedgz = []
for bed in beds:
    gzname = rezip_file(bed)
    bedgz.append(gzname)

# Compress wiggle files.
wigsgz = []
for wig in wigs:
    gzname = rezip_file(wig)
    wigsgz.append(gzname)

# Compress bedgraph files.
bgrgz = []
for bgr in bedgraphs:
    gzname = rezip_file(bgr)
    bgrgz.append(gzname)

# Don't compress bigwig files.
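# The permission/compression loops above are near-identical; a small helper
# would collapse them. A hedged sketch (_fix_perms_and_gzip is hypothetical,
# not part of this module):
#
#   def _fix_perms_and_gzip(group, fnames, compress=True):
#       out = []
#       for fname in fnames:
#           set_file_permissions(group, fname)
#           out.append(rezip_file(fname) if compress else fname)
#       return out
#
#   bedgz  = _fix_perms_and_gzip(grp, beds)
#   wigsgz = _fix_perms_and_gzip(grp, wigs)
#   bgrgz  = _fix_perms_and_gzip(grp, bedgraphs)
#   _fix_perms_and_gzip(grp, bigwigs, compress=False)  # bigwigs left as-is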
def run(self, flowcell, flowlane=None, fcq=None, destdir=None):
    '''The main entry point for the class.'''
    multiplexed = {}

    if destdir is None:
        destdir = self.conf.incoming

    # Get the list of new lanes from the flow cell.
    if fcq is None:
        fcq = FlowCellQuery(flowcell, flowlane, lims=self.lims,
                            trust_lims_adapters=self.trust_lims_adapters)

    flowlanes = set()
    if fcq.lims_fc.analysis_status not in self.ready:
        LOGGER.info("flow cell status '%s'", fcq.lims_fc.analysis_status)
        sys.exit("Flow cell analysis status not yet completed.")

    for (lanenum, libset) in fcq.lane_library.items():
        if lanenum not in multiplexed:
            multiplexed[lanenum] = set()
        for lib in libset:
            if fcq.lib_status[lib] in ('new',) or not self.db_library_check:

                # Only register the lane for demultiplexing if this lib is
                # not in lane.lims_samples().
                if not fcq.lane_demuxed[lanenum]:
                    multiplexed[lanenum].add(lib)
                flowlanes.add((fcq.lims_fc.fcid, lanenum))

    if len(flowlanes) == 0:
        LOGGER.info("No ready lanes for flowcell '%s'", flowcell)
        sys.exit("No lanes to process.")

    # We need to set our working directory to something suitable before
    # we start; otherwise we end up demuxing into a home directory or
    # similar.
    pwd = os.getcwd()
    os.chdir(destdir)

    downloading = Status.objects.get(code='downloading data')
    downloaded = Status.objects.get(code='downloaded')

    # For each lane...
    path = destdir
    for (flowcell, flowlane) in flowlanes:

        # Mark our lane(s) as active (note that each library has its own
        # version of this lane).
        for lane in Lane.objects.filter(flowcell=flowcell, flowlane=flowlane):
            lane.status = downloading
            lane.save()

        # Retrieve the file(s).
        fetcher = FQFileFetcher(destination=path, lims=self.lims,
                                test_mode=self.test_mode,
                                unprocessed_only=True,
                                force_download=self.force_download)
        fetcher.fetch(flowcell, flowlane)

        if self.test_mode:
            print("Test Mode: skipping download of %s lane %s to %s"
                  % (flowcell, flowlane, path))
            continue

        failed_fnames = {}
        for fname in fetcher.targets:
            if len(fname) > 0:

                # Check the file was actually retrieved.
                if not os.path.exists(fname):
                    LOGGER.error("Can't seem to find expected file '%s'", fname)
                    failed_fnames[fname] = fname
                else:
                    muxed_libs = multiplexed[flowlane]
                    if len(muxed_libs) > 1:

                        # Demultiplex the file if required. Here we
                        # unfortunately have to unzip the data, and we will
                        # rezip it following the process regardless of its
                        # input state.
                        if is_zipped(fname):
                            fname = unzip_file(fname)
                        LOGGER.info("Demultiplexing file %s for libraries: %s",
                                    fname, ", ".join(muxed_libs))
                        self.demultiplex(muxed_libs, fname)
                        for lib in muxed_libs:
                            self.output_files += [rezip_file(dmf) for dmf
                                                  in self._demux_files[lib]]
                    else:
                        LOGGER.info("File does not require demultiplexing: %s",
                                    fname)
                        self.output_files.append(fname)

        for fname in self.output_files:
            if fname not in failed_fnames:

                # The next line will parse regular Fastq filenames or the
                # 10X tarball filenames.
                (code, flowcell, flowlane, flowpair) = \
                    parse_incoming_fastq_name(os.path.basename(fname),
                                              ext=r'.(fq.gz|tar)')
                LOGGER.info("Changing code=%s, flowcell=%s, flowlane=%s,"
                            " flowpair=%s to 'downloaded'",
                            code, flowcell, flowlane, flowpair)
                try:
                    lane = Lane.objects.get(flowcell=flowcell,
                                            flowlane=flowlane,
                                            library__code=code)
                    lane.status = downloaded
                    lane.save()
                except Lane.DoesNotExist, _err:
                    try:
                        lib = Library.objects.search_by_name(code)
                    except Library.DoesNotExist, _err:
                        LOGGER.error("No library %s. Unable to register lane"
                                     " for the library.", code)
                        continue
                    LOGGER.info("Registering lane for %s.", fname)
                    facobj = Facility.objects.get(code='CRI')
                    machine_obj = Machine.objects.get(code__iexact='Unknown')
                    lane = Lane(facility=facobj,
                                library=lib,
                                flowcell=flowcell,
                                flowlane=flowlane,
                                lanenum=Lane.objects.next_lane_number(lib),
                                status=downloaded,
                                rundate='2008-01-01',
                                paired=False,
                                genomicssampleid='',
                                usersampleid=code,
                                runnumber='',
                                seqsamplepf='',
                                seqsamplebad='',
                                failed=False,
                                machine=machine_obj)
                    lane.save()
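# Entry-point sketch (hedged; 'proc' stands in for a configured instance of
# the class owning run(), and the flowcell ID is purely illustrative):
#
#   proc.run('HJWJ7BBXX', flowlane=None)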
def insert_into_repository(self, move_files=True):
    '''Insert self.output_files into the database.'''
    if len(self.output_files) == 0:
        self.generate()

    params = {self.target_name: self.target}
    qcobj = self.data_process.objects.create(**params)

    DataProvenance.objects.create(program=self._dbprog,
                                  parameters=self.program_params,
                                  rank_index=1,
                                  data_process=qcobj)

    for i in range(len(self.output_files)):
        fname = self.output_files[i]
        if len(self.output_md5s) != len(self.output_files):
            checksum = None
        else:
            checksum = self.output_md5s[i]

        LOGGER.info("Inserting %s", fname)

        # Note: this will fail if multiple types match.
        ftype = Filetype.objects.guess_type(fname)

        if os.path.isabs(fname):
            fpath = fname
        else:
            fpath = os.path.join(self.workdir, fname)

        if checksum is None or checksum == '':
            checksum = checksum_file(fpath)

        fparms = {self.file_target_name: qcobj,
                  'filename': os.path.split(fname)[1],
                  'checksum': checksum,
                  'filetype': ftype}
        fobj = self.data_file(**fparms)
        fobj.save()

        if move_files:

            # Zip up the file if necessary.
            if ftype.gzip and os.path.splitext(fname)[1] != CONFIG.gzsuffix:
                fpath = rezip_file(fpath)

            if self.move_files:
                dest = fobj.repository_file_path
                # destdir = os.path.dirname(dest)
                # if not os.path.exists(destdir):
                #     os.makedirs(destdir)
                # move(fpath, dest)
                # set_file_permissions(CONFIG.group, dest)
                if os.path.isabs(dest):
                    dest = os.path.split(dest)[0] + '/'
                transfer_file(fpath,
                              "%s@%s:%s" % (CONFIG.user, CONFIG.datahost, dest),
                              set_ownership=True)
                # Note that transfer_file sets destination file permissions
                # as configured in CONFIG.
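# Design note: the commented-out move()/set_file_permissions() block above
# is the old local-filesystem path. transfer_file() instead pushes the file
# to the configured repository host (CONFIG.datahost) using an scp-style
# user@host:path destination, with ownership and permissions set on the far
# side; the trailing '/' makes the destination an explicit directory.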