def rsync_files(to_copy, logfile, group, dry): # Iterate over the files to copy and create directories and copy files as necessary successful = 0 uid = os.getuid() gid = os.getgid() if group is not None and len(group) > 0: gid = grp.getgrnam(group).gr_gid for src_file, dst_dir, dst_name in to_copy: dst_file = os.path.join(dst_dir, dst_name) print "Will copy (rsync) ", src_file, "to ", dst_file if not dry: # Create the destination directory if necessary logfile.write("[{:s}] - Creating run-level delivery directory: {:s} " \ "(or leaving it in place if already present)\n".format(utc_time(), dst_dir)) if os.path.exists(dst_dir): print("Directory {:s} already exists!".format(dst_dir)) else: try: # Create directory hierarchy with ug+rwX permissions os.makedirs(dst_dir, 0770) os.chown(dst_dir, uid, gid) except: print("Could not create run-level delivery directory!") clean_exit(1, logfile, dry) # Rsync the file across command_to_execute = ['rsync', '-ac', src_file, dst_file] logfile.write("[{:s}] - Executing command: {:s}\n".format( utc_time(), " ".join(command_to_execute))) logfile.flush() try: check_call(command_to_execute) except CalledProcessError, e: logfile.write( "[{:s}] - rsync exited with exit code {:d}\n".format( utc_time(), e.returncode)) raise e logfile.write("[{:s}] - rsync exited with exit code 0\n".format( utc_time())) successful += 1 print("{:d} of {:d} files copied successfully".format( successful, len(to_copy))) # Modify the permissions to ug+rw os.chown(dst_file, uid, gid) os.chmod(dst_file, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
def __init__(self, **kw):
    """Initialize the document dict from keyword arguments.

    Sets an auto-generated couch ``_id`` (uuid hex) unless one is supplied,
    stamps the class-level entity type and creation/modification times,
    then populates the class-declared scalar, dict and list fields with
    defaults for anything not passed in ``kw``. Finally folds the full
    keyword set into the document via ``_update``.
    """
    self["_id"] = kw.get("_id", uuid4().hex)
    self["entity_type"] = self._entity_type
    self["name"] = kw.get("name", None)
    self["creation_time"] = kw.get("creation_time", utc_time())
    self["modification_time"] = kw.get("modification_time", utc_time())
    for f in self._fields:
        self[f] = kw.get(f, None)
    for f in self._dict_fields:
        self[f] = kw.get(f, {})
    for f in self._list_fields:
        # BUGFIX: list-typed fields previously defaulted to {} (an empty
        # dict, copy-pasted from the dict-field loop above); a list field
        # should default to an empty list.
        self[f] = kw.get(f, [])
    self = _update(self, kw)
def rsync_files(to_copy, logfile, group, dry):
    """Copy each entry in ``to_copy`` to its delivery destination with rsync.

    :param to_copy: iterable of (src_file, dst_dir, dst_name) tuples
    :param logfile: open file-like object receiving timestamped log lines
    :param group: group name whose gid is set on created dirs/files
                  (current process gid is used if None or empty)
    :param dry: if True, only print what would be done

    NOTE(review): this appears to be a duplicate of the other rsync_files
    definition in this file; the bare ``except:`` below swallows every
    exception type -- consider narrowing it to OSError.
    """
    # Iterate over the files to copy and create directories and copy files as necessary
    successful = 0
    uid = os.getuid()
    gid = os.getgid()
    if group is not None and len(group) > 0:
        # Resolve the delivery group name to a numeric gid
        gid = grp.getgrnam(group).gr_gid
    for src_file, dst_dir, dst_name in to_copy:
        dst_file = os.path.join(dst_dir, dst_name)
        print "Will copy (rsync) ", src_file, "to ", dst_file
        if not dry:
            # Create the destination directory if necessary
            logfile.write("[{:s}] - Creating run-level delivery directory: {:s} " \
                          "(or leaving it in place if already present)\n".format(utc_time(), dst_dir))
            if os.path.exists(dst_dir):
                print("Directory {:s} already exists!".format(dst_dir))
            else:
                try:
                    # Create directory hierarchy with ug+rwX permissions
                    os.makedirs(dst_dir, 0770)
                    os.chown(dst_dir, uid, gid)
                except:
                    print("Could not create run-level delivery directory!")
                    clean_exit(1, logfile, dry)
            # Rsync the file across; -a preserves attributes, -c compares by checksum
            command_to_execute = ['rsync', '-ac', src_file, dst_file]
            logfile.write("[{:s}] - Executing command: {:s}\n".format(utc_time(), " ".join(command_to_execute)))
            logfile.flush()
            try:
                check_call(command_to_execute)
            except CalledProcessError, e:
                logfile.write("[{:s}] - rsync exited with exit code {:d}\n".format(utc_time(), e.returncode))
                raise e
            logfile.write("[{:s}] - rsync exited with exit code 0\n".format(utc_time()))
            successful += 1
            print("{:d} of {:d} files copied successfully".format(successful, len(to_copy)))
            # Modify the permissions to ug+rw
            os.chown(dst_file, uid, gid)
            os.chmod(dst_file, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
def update_fn(db, obj):
    """Compare ``obj`` against its counterpart in the couch database.

    Looks the object up by its ``name`` through the "names/id_to_name"
    view and returns:
    - ``obj`` stamped with a fresh creation_time when no db counterpart exists,
    - ``None`` when an equal document is already stored,
    - otherwise ``obj`` carrying over the db document's _id/_rev and
      creation_time, with modification_time set to now.
    """
    t_utc = utc_time()

    def equal(a, b):
        # Compare documents over the union of their keys, ignoring couch
        # bookkeeping fields and timestamps.
        ignore = ["_id", "_rev", "creation_time", "modification_time"]
        a_keys = [str(x) for x in a.keys() if x not in ignore]
        b_keys = [str(x) for x in b.keys() if x not in ignore]
        keys = list(set(a_keys + b_keys))
        return {k: a.get(k, None) for k in keys} == {k: b.get(k, None) for k in keys}

    # BUGFIX: 'view' was previously assigned only inside two isinstance
    # checks (FlowcellRunMetrics / SampleRunMetrics) that both selected the
    # same "names/id_to_name" view, leaving 'view' unbound (NameError) for
    # any other object type. The lookup is identical either way, so resolve
    # it unconditionally.
    view = db.view("names/id_to_name")
    d_view = {k.value: k for k in view}
    dbid = d_view.get(obj["name"], None)
    dbobj = None
    if dbid:
        dbobj = db.get(dbid.id, None)
    if dbobj is None:
        # New document: stamp creation time and save as-is
        obj["creation_time"] = t_utc
        return obj
    if equal(obj, dbobj):
        # Unchanged: signal that no save is needed
        return None
    # Changed: keep the db document's identity and original creation time
    obj["creation_time"] = dbobj.get("creation_time")
    obj["modification_time"] = t_utc
    obj["_rev"] = dbobj.get("_rev")
    obj["_id"] = dbobj.get("_id")
    return obj
def remove_finished(self):
    """Remove delivered data for samples that are flagged as finished.

    For each sample directory under the project path: skip it unless a
    FINISHED_FILE flag exists and no REMOVED_FILE flag exists, optionally
    ask for confirmation, delete all files (except the FINISHED_FILE flag)
    and all subdirectories, then write a REMOVED_FILE flag containing the
    UTC timestamp (unless this is a dry run).
    """
    if not self._check_pargs(["project"]):
        return
    # Don't filter out files
    def filter_fn(f):
        return True
    slist = os.listdir(os.path.join(self._meta.root_path, self._meta.path_id))
    for s in slist:
        spath = os.path.join(self._meta.root_path, self._meta.path_id, s)
        if not os.path.isdir(spath):
            continue
        # Only samples explicitly marked finished are eligible for removal
        if not os.path.exists(os.path.join(spath, FINISHED_FILE)):
            self.app.log.info("Sample {} not finished; skipping".format(s))
            continue
        flist = filtered_walk(spath, filter_fn)
        dlist = filtered_walk(spath, filter_fn, get_dirs=True)
        if os.path.exists(os.path.join(spath, REMOVED_FILE)):
            self.app.log.info("Sample {} already removed; skipping".format(s))
            continue
        if len(flist) > 0 and not query_yes_no("Will remove directory {} containing {} files; continue?".format(s, len(flist)), force=self.pargs.force):
            continue
        self.app.log.info("Removing {} files from {}".format(len(flist), spath))
        for f in flist:
            # Keep the FINISHED_FILE flag so the sample stays marked finished
            if f == os.path.join(spath, FINISHED_FILE):
                continue
            self.app.cmd.safe_unlink(f)
        self.app.log.info("Removing {} directories from {}".format(len(dlist), spath))
        # Reverse-sorted so children are removed before their parents
        for d in sorted(dlist, reverse=True):
            self.app.cmd.safe_rmdir(d)
        if not self.pargs.dry_run:
            # Record the removal with a timestamped flag file
            with open(os.path.join(spath, REMOVED_FILE), "w") as fh:
                t_utc = utc_time()
                fh.write(t_utc)
def touch_finished(self): if not self._check_pargs(["project", "sample"]): return if os.path.exists(self.pargs.sample) and os.path.isfile(self.pargs.sample): with open(self.pargs.sample) as fh: slist = [x.rstrip() for x in fh.readlines()] else: slist = [self.pargs.sample] for s in slist: spath = os.path.join(self._meta.root_path, self._meta.path_id, s) if not os.path.exists(spath): self.app.log.warn("No such path {}; skipping".format(spath)) continue rsync_src = os.path.join(self._meta.root_path, self._meta.path_id, s) + os.sep rsync_tgt = os.path.join(self.app.config.get("runqc", "root"), self.pargs.project, s) + os.sep cl = ["rsync {} {} {}".format(self.app.config.get("runqc", "rsync_sample_opts"), rsync_src, rsync_tgt)] self.app.log.info("Checking if runqc uptodate with command '{}'".format(" ".join(cl))) out = self.app.cmd.command(cl, **{'shell':True}) if not self.pargs.dry_run and not out.find("total size is 0"): self.app.log.info("Some files need to be updated. Rsync output:") print "********" print out print "********" continue if not query_yes_no("Going to touch file {} for sample {}; continue?".format(FINISHED_FILE, s), force=self.pargs.force): continue self.app.log.info("Touching file {} for sample {}".format(FINISHED_FILE, s)) with open(os.path.join(spath, FINISHED_FILE), "w") as fh: t_utc = utc_time() fh.write(t_utc)
def update_fn(cls, db, obj, viewname="names/id_to_name", key="name"):
    """Compare object with object in db if present.

    :param cls: calling class
    :param db: couch database
    :param obj: database object to save
    :param viewname: couch view used to map document names to ids
    :param key: document field used for the view lookup
    :returns: database object to save and database id if present
    """
    now = utc_time()
    bookkeeping = ("_id", "_rev", "creation_time", "modification_time")

    def _projected(doc, keys):
        # Project a document onto the given keys, defaulting missing ones to None
        return {k: doc.get(k, None) for k in keys}

    def equal(a, b):
        # Equality over the union of both key sets, ignoring couch
        # bookkeeping fields and timestamps
        keys = {str(k) for k in a.keys() if k not in bookkeeping}
        keys |= {str(k) for k in b.keys() if k not in bookkeeping}
        return _projected(a, keys) == _projected(b, keys)

    name_to_row = {row.value: row for row in db.view(viewname)}
    dbid = name_to_row.get(obj[key], None)
    dbobj = db.get(dbid.id, None) if dbid else None
    if dbobj is None:
        # Brand new document: stamp creation time
        obj["creation_time"] = now
        return (obj, dbid)
    if equal(obj, dbobj):
        # Identical to the stored document: nothing to save
        return (None, dbid)
    # Changed document: carry over identity and original creation time
    obj["creation_time"] = dbobj.get("creation_time")
    obj["modification_time"] = now
    obj["_rev"] = dbobj.get("_rev")
    obj["_id"] = dbobj.get("_id")
    return (obj, dbid)
def update_fn(cls, db, obj, viewname="names/id_to_name", key="name"):
    """Compare object with object in db if present.

    :param cls: calling class
    :param db: couch database
    :param obj: database object to save
    :param viewname: couch view used to map document names to ids
    :param key: document field used for the view lookup
    :returns: database object to save and database id if present
    """
    now = utc_time()
    bookkeeping = ("_id", "_rev", "creation_time", "modification_time")

    def equal(a, b):
        # Compare the two documents over the union of their keys,
        # ignoring couch bookkeeping fields and timestamps.
        keys = set()
        for doc in (a, b):
            keys.update(str(k) for k in doc.keys() if k not in bookkeeping)
        return all(a.get(k, None) == b.get(k, None) for k in keys)

    rows = {row.value: row for row in db.view(viewname)}
    dbid = rows.get(obj[key], None)
    dbobj = db.get(dbid.id, None) if dbid else None
    if dbobj is None:
        # Brand new document: stamp creation time
        obj["creation_time"] = now
        return (obj, dbid)
    if equal(obj, dbobj):
        # Identical to the stored document: nothing to save
        return (None, dbid)
    # Merge the newly created object with the one found in the database,
    # replacing the information found in the database for the new one if
    # found under the same key
    merge(obj, dbobj)
    # We need the original times and id from the DB object though
    obj["creation_time"] = dbobj.get("creation_time")
    obj["modification_time"] = now
    obj["_rev"] = dbobj.get("_rev")
    obj["_id"] = dbobj.get("_id")
    return (obj, dbid)
def touch_finished(self): if not self._check_pargs(["project", "sample"]): return if os.path.exists(self.pargs.sample) and os.path.isfile( self.pargs.sample): with open(self.pargs.sample) as fh: slist = [x.rstrip() for x in fh.readlines()] else: slist = [self.pargs.sample] for s in slist: spath = os.path.join(self._meta.root_path, self._meta.path_id, s) if not os.path.exists(spath): self.app.log.warn("No such path {}; skipping".format(spath)) continue rsync_src = os.path.join(self._meta.root_path, self._meta.path_id, s) + os.sep rsync_tgt = os.path.join(self.app.config.get("runqc", "root"), self.pargs.project, s) + os.sep cl = [ "rsync {} {} {}".format( self.app.config.get("runqc", "rsync_sample_opts"), rsync_src, rsync_tgt) ] self.app.log.info( "Checking if runqc uptodate with command '{}'".format( " ".join(cl))) out = self.app.cmd.command(cl, **{'shell': True}) if not self.pargs.dry_run and not out.find("total size is 0"): self.app.log.info( "Some files need to be updated. Rsync output:") print "********" print out print "********" continue if not query_yes_no( "Going to touch file {} for sample {}; continue?".format( FINISHED_FILE, s), force=self.pargs.force): continue self.app.log.info("Touching file {} for sample {}".format( FINISHED_FILE, s)) with open(os.path.join(spath, FINISHED_FILE), "w") as fh: t_utc = utc_time() fh.write(t_utc)
def remove_finished(self):
    """Remove delivered data for samples that are flagged as finished.

    For each sample directory under the project path: skip it unless a
    FINISHED_FILE flag exists and no REMOVED_FILE flag exists, optionally
    ask for confirmation, delete all files (except the FINISHED_FILE
    flag) and all subdirectories, then write a REMOVED_FILE flag
    containing the UTC timestamp (unless this is a dry run).
    """
    if not self._check_pargs(["project"]):
        return

    # Don't filter out files
    def filter_fn(f):
        return True

    base = os.path.join(self._meta.root_path, self._meta.path_id)
    for sample_name in os.listdir(base):
        sample_path = os.path.join(base, sample_name)
        if not os.path.isdir(sample_path):
            continue
        finished_flag = os.path.join(sample_path, FINISHED_FILE)
        if not os.path.exists(finished_flag):
            self.app.log.info(
                "Sample {} not finished; skipping".format(sample_name))
            continue
        files = filtered_walk(sample_path, filter_fn)
        dirs = filtered_walk(sample_path, filter_fn, get_dirs=True)
        if os.path.exists(os.path.join(sample_path, REMOVED_FILE)):
            self.app.log.info(
                "Sample {} already removed; skipping".format(sample_name))
            continue
        if len(files) > 0 and not query_yes_no(
                "Will remove directory {} containing {} files; continue?".
                format(sample_name, len(files)), force=self.pargs.force):
            continue
        self.app.log.info("Removing {} files from {}".format(
            len(files), sample_path))
        for path in files:
            # Keep the FINISHED_FILE flag so the sample stays marked finished
            if path == finished_flag:
                continue
            self.app.cmd.safe_unlink(path)
        self.app.log.info("Removing {} directories from {}".format(
            len(dirs), sample_path))
        # Reverse-sorted so children are removed before their parents
        for d in sorted(dirs, reverse=True):
            self.app.cmd.safe_rmdir(d)
        if not self.pargs.dry_run:
            # Record the removal with a timestamped flag file
            with open(os.path.join(sample_path, REMOVED_FILE), "w") as fh:
                fh.write(utc_time())
def main():
    """Command-line entry point: deliver a project's fastq files to an
    UPPMAX project INBOX using rsync.

    Parses the command line, validates that the project exists under the
    Casava path, opens a timestamped delivery log (stdout on dry runs),
    optionally lets the user pick samples interactively, warns about
    uncompressed fastq files, then hands the copy list to rsync_files.
    """
    parser = argparse.ArgumentParser(description="A script to help doing the deliveries, now using the Casava directory structure. " \
        "The user is asked to provide a project ID, a run name, and an UPPMAX project")
    parser.add_argument('-c', '--casava-path', action="store", dest="caspath", default='/proj/a2010002/nobackup/illumina/',
                        help="Specify a path to a Casava directory manually")
    parser.add_argument('-l', '--log-path', action="store", dest="logpath", default='/proj/a2010002/private/delivery_logs',
                        help="Specify a path to a log file")
    parser.add_argument('-i', '--interactive', action="store_true", dest="interactive", default=False,
                        help="Interactively select samples to be delivered")
    parser.add_argument('-d', '--dry-run', action="store_true", dest="dry", default=False,
                        help="Dry run: nothing will be done")
    parser.add_argument('-a', '--deliver-all-fcs', action="store_true", dest="deliver_all_fcs", default=False,
                        help="rsync samples from all flow cells. Default is to only deliver from specified flowcell")
    parser.add_argument('-p', '--nophix', action="store_true", dest="deliver_nophix", default=False,
                        help="Deliver fastq files from nophix subdirectory. Default is to deliver from run directory")
    parser.add_argument('-g', '--group', action="store", dest="group", default="uppmax",
                        help="Group membership to set on copied files")
    parser.add_argument('project_name', action='store', help="Project name to deliver, e.g. J.Doe_10_01")
    parser.add_argument('flowcell_id', action='store', help="Flowcell id to deliver, e.g. 120824_BD1915ACXX")
    parser.add_argument('uppmax_id', action='store', help="UPPMAX project id to deliver to, e.g. b2012001")
    args = parser.parse_args()
    if not args.project_name in os.listdir(args.caspath):
        print("Could not find project. Check directory listing:")
        for f in os.listdir(args.caspath):
            print(f)
        clean_exit(0, None, args.dry)
    # Normalize over-long flowcell ids to date_barcode form
    fcid = args.flowcell_id
    fcid_comp = fcid.split('_')
    if len(fcid_comp) > 2:
        fcid = fcid_comp[0] + '_' + fcid_comp[-1]
        print("FCID format too long, trying {:s}".format(fcid))
    # Timestamped log file name; dry runs log to stdout instead
    dt = datetime.now()
    time_str = "_".join([
        str(dt.year), str(dt.month), str(dt.day),
        str(dt.hour), str(dt.minute), str(dt.second)
    ])
    logfilename = os.path.join(os.path.normpath(args.logpath),
                               "{:s}.log".format(time_str))
    if not args.dry:
        logfile = open(logfilename, "w")
    else:
        logfile = sys.stdout
    logfile.write("[{:s}] - Project to move files for:\n{:s}\n".format(
        utc_time(), args.project_name))
    logfile.flush()
    proj_base_dir = os.path.join(args.caspath, args.project_name)
    # Interactive mode: collect the sample directories the user declines
    skip_list = []
    if args.interactive:
        for sample_dir in os.listdir(proj_base_dir):
            if not os.path.isdir(os.path.join(proj_base_dir, sample_dir)):
                continue
            if not query_yes_no("Deliver sample {:s}?".format(sample_dir), default="no"):
                skip_list.append(sample_dir)
    created_proj_dir_name = fixProjName(args.project_name)
    del_path_top = '/proj/' + args.uppmax_id + "/INBOX/" + created_proj_dir_name
    to_copy = get_file_copy_list(proj_base_dir, del_path_top, fcid,
                                 args.deliver_all_fcs, args.deliver_nophix, skip_list)
    # Prompt user if any of the files are non-compressed
    for fqfile, _, _ in to_copy:
        if os.path.splitext(fqfile)[1] == ".gz":
            continue
        print("WARNING: The file {:s}, which you are about to deliver, does not seem to be compressed. " \
              "It is recommended that you compress files prior to delivery.".format(fqfile))
        if query_yes_no("Do you wish to continue delivering " \
                        "uncompressed fastq files?", default="yes"):
            break
        clean_exit(1, logfile, args.dry)
    rsync_files(to_copy, logfile, args.group, args.dry)
    clean_exit(0, logfile, args.dry)
def raw_data(self):
    """Deliver raw sequencing data (fastq files) for a project to the
    configured Uppnex delivery directory.

    Validates configuration and paths, gathers the project's sample runs
    from statusdb, optionally filters them interactively, transfers the
    files with rsync, verifies each transfer by md5 checksum, fixes file
    permissions, and logs successful deliveries back to statusdb.
    """
    if not self._check_pargs(["project"]):
        return
    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]
    # get the uid and gid to use for destination files
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        # BUGFIX: was 'grp.getgrnam(group)' -- 'group' is not defined in
        # this scope and raised NameError; the group name comes from pargs.
        gid = grp.getgrnam(self.pargs.group).gr_gid
    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project databse"
    self.log.debug("Connecting to samples database")
    s_con = SampleRunMetricsConnection(**vars(self.pargs))
    assert s_con, "Could not get connection to samples databse"
    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(
            self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error(
                "Uppmax project was not specified and could not be fetched from project database"
            )
            return
    # Extract the list of samples and runs associated with the project and sort them
    samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell,
                                       sample_prj=self.pargs.project),
                     key=lambda k: (k.get('project_sample_name', 'NA'),
                                    k.get('flowcell', 'NA'),
                                    k.get('lane', 'NA')))
    # Setup paths and verify parameters
    self._meta.production_root = self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(
        self._meta.production_root
    ), "No such directory {}; check your production config".format(
        self._meta.production_root)
    assert os.path.exists(
        proj_base_dir), "No project {} in production path {}".format(
            self.pargs.project, self._meta.root_path)
    try:
        self._meta.uppnex_project_root = self.app.config.get(
            "deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn(
            "{}, will use '/proj' as uppnext_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'
    try:
        self._meta.uppnex_delivery_dir = self.app.config.get(
            "deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn(
            "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                e))
        self._meta.uppnex_delivery_dir = 'INBOX'
    destination_root = os.path.join(self._meta.uppnex_project_root,
                                    self.pargs.uppmax_project,
                                    self._meta.uppnex_delivery_dir)
    assert os.path.exists(
        destination_root
    ), "Delivery destination folder {} does not exist".format(
        destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)
    # If interactively select, build a list of samples to skip
    if self.pargs.interactive:
        to_process = []
        for sample in samples:
            sname = sample.get("project_sample_name")
            index = sample.get("sequence")
            fcid = sample.get("flowcell")
            lane = sample.get("lane")
            date = sample.get("date")
            self.log.info(
                "Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}"
                .format(sname, index, fcid, lane, date))
            if query_yes_no("Deliver sample?", default="no"):
                to_process.append(sample)
        samples = to_process
    # Find uncompressed fastq
    uncompressed = self._find_uncompressed_fastq_files(
        proj_base_dir, samples)
    if len(uncompressed) > 0:
        self.log.warn(
            "The following samples have uncompressed *.fastq files that cannot be delivered: {}"
            .format(",".join(uncompressed)))
        if not query_yes_no("Continue anyway?", default="no"):
            return
    self.log.info(
        "Will deliver data for {} samples from project {} to {}".format(
            len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return
    # Get the list of files to transfer and the destination
    self.log.debug("Gathering list of files to copy")
    to_copy = self.get_file_copy_list(proj_base_dir, destination_root,
                                      samples)
    # Make sure that transfer will be with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no(
                "Do you wish to continue delivering using rsync?",
                default="yes"):
            return
        self.pargs.rsync = True
    # Process each sample run
    for id, files in to_copy.items():
        # get the sample database object
        [sample] = [s for s in samples if s.get('_id') == id]
        self.log.info("Processing sample {} and flowcell {}".format(
            sample.get("project_sample_name", "NA"),
            sample.get("flowcell", "NA")))
        # calculate md5sums on the source side and write it on the destination
        md5 = []
        for f in files:
            m = md5sum(f[0])
            mfile = "{}.md5".format(f[1])
            md5.append([m, mfile, f[2], f[0]])
            self.log.debug("md5sum for source file {}: {}".format(f[0], m))
        # transfer files
        self.log.debug("Transferring {} fastq files".format(len(files)))
        self._transfer_files([f[0] for f in files], [f[1] for f in files])
        # write the md5sum to a file at the destination and verify the transfer
        passed = True
        for m, mfile, read, srcpath in md5:
            dstfile = os.path.splitext(mfile)[0]
            self.log.debug("Writing md5sum to file {}".format(mfile))
            self.app.cmd.write(
                mfile, "{} {}".format(m, os.path.basename(dstfile)), True)
            self.log.debug("Verifying md5sum for file {}".format(dstfile))
            # if dry-run, make sure verification pass
            if self.pargs.dry_run:
                dm = m
            else:
                dm = md5sum(dstfile)
            self.log.debug("md5sum for destination file {}: {}".format(
                dstfile, dm))
            if m != dm:
                self.log.warn(
                    "md5sum verification FAILED for {}. Source: {}, Target: {}"
                    .format(dstfile, m, dm))
                self.log.warn(
                    "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                    .format(dstfile))
                self.app.cmd.safe_unlink(dstfile)
                self.app.cmd.safe_unlink(mfile)
                passed = False
                continue
            # Modify the permissions to ug+rw
            for f in [dstfile, mfile]:
                self.app.cmd.chmod(
                    f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                    | stat.S_IWGRP)
        # touch the flag to trigger uppmax inbox permission fix
        self.app.cmd.safe_touchfile(
            os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                         self.pargs.uppmax_project))
        # log the transfer to statusdb if verification passed
        if passed:
            self.log.info(
                "Logging delivery to StatusDB document {}".format(id))
            data = {
                'raw_data_delivery': {
                    'timestamp': utc_time(),
                    'files': {
                        'R{}'.format(read): {
                            'md5': m,
                            'path': os.path.splitext(mfile)[0],
                            'size_in_bytes':
                            self._getsize(os.path.splitext(mfile)[0]),
                            'source_location': srcpath
                        }
                        for m, mfile, read, srcpath in md5
                    },
                }
            }
            jsonstr = json.dumps(data)
            jsonfile = os.path.join(
                os.path.dirname(md5[0][3]),
                "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                    sample.get("date"), sample.get("flowcell"),
                    sample.get("project_sample_name"),
                    sample.get("sequence"), sample.get("lane")))
            self.log.debug(
                "Writing delivery to json file {}".format(jsonfile))
            self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
            self.log.debug(
                "Saving delivery in StatusDB document {}".format(id))
            sample.update(data)
            self._save(s_con, sample)
            self.log.debug(jsonstr)
def raw_data(self):
    """Deliver raw sequencing data (fastq files) for a project to the
    configured Uppnex delivery directory.

    Validates configuration and paths, gathers the project's sample runs
    from statusdb, optionally filters them interactively, transfers the
    files, and -- unless linking or dry-running -- verifies each transfer
    by md5 checksum, fixes permissions and logs the delivery to statusdb.

    NOTE(review): 'grp.getgrnam(group)' below references an undefined
    name 'group' (presumably self.pargs.group was intended) -- this
    raises NameError whenever a group is given; confirm before relying
    on the gid handling.
    """
    if not self._check_pargs(["project"]):
        return
    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]
    # get the uid and gid to use for destination files
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        gid = grp.getgrnam(group).gr_gid
    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project databse"
    self.log.debug("Connecting to samples database")
    s_con = SampleRunMetricsConnection(**vars(self.pargs))
    assert s_con, "Could not get connection to samples databse"
    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error("Uppmax project was not specified and could not be fetched from project database")
            return
    # Extract the list of samples and runs associated with the project and sort them
    samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell, sample_prj=self.pargs.project),
                     key=lambda k: (k.get('project_sample_name','NA'), k.get('flowcell','NA'), k.get('lane','NA')))
    # Setup paths and verify parameters
    self._meta.production_root = self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(self._meta.production_root), "No such directory {}; check your production config".format(self._meta.production_root)
    assert os.path.exists(proj_base_dir), "No project {} in production path {}".format(self.pargs.project,self._meta.root_path)
    try:
        self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn("{}, will use '/proj' as uppnext_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'
    try:
        self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn("{}, will use 'INBOX' as uppnext_project_delivery_path".format(e))
        self._meta.uppnex_delivery_dir = 'INBOX'
    destination_root = os.path.join(self._meta.uppnex_project_root,self.pargs.uppmax_project,self._meta.uppnex_delivery_dir)
    assert os.path.exists(destination_root), "Delivery destination folder {} does not exist".format(destination_root)
    destination_root = os.path.join(destination_root,self.pargs.project)
    # If interactively select, build a list of samples to skip
    if self.pargs.interactive:
        to_process = []
        for sample in samples:
            sname = sample.get("project_sample_name")
            index = sample.get("sequence")
            fcid = sample.get("flowcell")
            lane = sample.get("lane")
            date = sample.get("date")
            self.log.info("Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}".format(sname, index, fcid, lane, date))
            if query_yes_no("Deliver sample?", default="no"):
                to_process.append(sample)
        samples = to_process
    # Find uncompressed fastq
    uncompressed = self._find_uncompressed_fastq_files(proj_base_dir,samples)
    if len(uncompressed) > 0:
        self.log.warn("The following samples have uncompressed *.fastq files that cannot be delivered: {}".format(",".join(uncompressed)))
        if not query_yes_no("Continue anyway?", default="no"):
            return
    self.log.info("Will deliver data for {} samples from project {} to {}".format(len(samples),self.pargs.project,destination_root))
    if not query_yes_no("Continue?"):
        return
    # Get the list of files to transfer and the destination
    self.log.debug("Gathering list of files to copy")
    to_copy = self.get_file_copy_list(proj_base_dir, destination_root, samples)
    # Make sure that transfer will be with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
            return
        self.pargs.rsync = True
    # Process each sample run
    for id, files in to_copy.items():
        # get the sample database object
        [sample] = [s for s in samples if s.get('_id') == id]
        self.log.info("Processing sample {} and flowcell {}".format(sample.get("project_sample_name","NA"),sample.get("flowcell","NA")))
        # transfer files
        self.log.debug("Transferring {} fastq files".format(len(files)))
        self._transfer_files([f[0] for f in files], [f[1] for f in files])
        passed = True
        if self.pargs.link or self.pargs.dry_run:
            # Linked or dry-run deliveries are not md5-verified, so they
            # are also not logged to statusdb below
            passed = False
        else:
            # calculate md5sums on the source side and write it on the destination
            md5 = []
            for f in files:
                m = md5sum(f[0])
                mfile = "{}.md5".format(f[1])
                md5.append([m,mfile,f[2],f[0]])
                self.log.debug("md5sum for source file {}: {}".format(f[0],m))
            # write the md5sum to a file at the destination and verify the transfer
            for m, mfile, read, srcpath in md5:
                dstfile = os.path.splitext(mfile)[0]
                self.log.debug("Writing md5sum to file {}".format(mfile))
                self.app.cmd.write(mfile,"{} {}".format(m,os.path.basename(dstfile)),True)
                self.log.debug("Verifying md5sum for file {}".format(dstfile))
                dm = md5sum(dstfile)
                self.log.debug("md5sum for destination file {}: {}".format(dstfile,dm))
                if m != dm:
                    # Checksum mismatch: drop the broken copy so it can be retried
                    self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile,m,dm))
                    self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                    self.app.cmd.safe_unlink(dstfile)
                    self.app.cmd.safe_unlink(mfile)
                    passed = False
                    continue
                # Modify the permissions to ug+rw
                for f in [dstfile, mfile]:
                    self.app.cmd.chmod(f,stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
        # touch the flag to trigger uppmax inbox permission fix
        self.app.cmd.safe_touchfile(os.path.join("/sw","uppmax","var","inboxfix","schedule",self.pargs.uppmax_project))
        # log the transfer to statusdb if verification passed
        if passed:
            self.log.info("Logging delivery to StatusDB document {}".format(id))
            data = {'raw_data_delivery': {'timestamp': utc_time(),
                                          'files': {'R{}'.format(read): {'md5': m,
                                                                         'path': os.path.splitext(mfile)[0],
                                                                         'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                                                         'source_location': srcpath}
                                                    for m, mfile, read, srcpath in md5},
                                          }
                    }
            jsonstr = json.dumps(data)
            jsonfile = os.path.join(os.path.dirname(md5[0][3]),
                                    "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(sample.get("date"),
                                                                                    sample.get("flowcell"),
                                                                                    sample.get("project_sample_name"),
                                                                                    sample.get("sequence"),
                                                                                    sample.get("lane")))
            self.log.debug("Writing delivery to json file {}".format(jsonfile))
            self.app.cmd.write(jsonfile,data=jsonstr,overwrite=True)
            self.log.debug("Saving delivery in StatusDB document {}".format(id))
            sample.update(data)
            self._save(s_con,sample)
            self.log.debug(jsonstr)
def main():
    """Command-line entry point: deliver a project's fastq files to an
    UPPMAX project INBOX using rsync.

    Parses the command line, validates that the project exists under the
    Casava path, opens a timestamped delivery log (stdout on dry runs),
    optionally lets the user pick samples interactively, warns about
    uncompressed fastq files, then hands the copy list to rsync_files.
    """
    parser = argparse.ArgumentParser(description="A script to help doing the deliveries, now using the Casava directory structure. " \
        "The user is asked to provide a project ID, a run name, and an UPPMAX project")
    parser.add_argument('-c', '--casava-path', action="store", dest="caspath", default='/proj/a2010002/nobackup/illumina/',
                        help="Specify a path to a Casava directory manually")
    parser.add_argument('-l', '--log-path', action="store", dest="logpath", default='/proj/a2010002/private/delivery_logs',
                        help="Specify a path to a log file")
    parser.add_argument('-i', '--interactive', action="store_true", dest="interactive", default=False,
                        help="Interactively select samples to be delivered")
    parser.add_argument('-d', '--dry-run', action="store_true", dest="dry", default=False,
                        help="Dry run: nothing will be done")
    parser.add_argument('-a', '--deliver-all-fcs', action="store_true", dest="deliver_all_fcs", default=False,
                        help="rsync samples from all flow cells. Default is to only deliver from specified flowcell")
    parser.add_argument('-p', '--nophix', action="store_true", dest="deliver_nophix", default=False,
                        help="Deliver fastq files from nophix subdirectory. Default is to deliver from run directory")
    # BUGFIX: this variant lacked the -g/--group option present in the other
    # main() in this file, yet rsync_files requires a group argument; added
    # for consistency (new optional flag, backward-compatible CLI).
    parser.add_argument('-g', '--group', action="store", dest="group", default="uppmax",
                        help="Group membership to set on copied files")
    parser.add_argument('project_name', action='store', help="Project name to deliver, e.g. J.Doe_10_01")
    parser.add_argument('flowcell_id', action='store', help="Flowcell id to deliver, e.g. 120824_BD1915ACXX")
    parser.add_argument('uppmax_id', action='store', help="UPPMAX project id to deliver to, e.g. b2012001")
    args = parser.parse_args()
    if not args.project_name in os.listdir(args.caspath):
        print("Could not find project. Check directory listing:")
        for f in os.listdir(args.caspath):
            print(f)
        clean_exit(0, None, args.dry)
    # Normalize over-long flowcell ids to date_barcode form
    fcid = args.flowcell_id
    fcid_comp = fcid.split('_')
    if len(fcid_comp) > 2:
        fcid = fcid_comp[0] + '_' + fcid_comp[-1]
        print("FCID format too long, trying {:s}".format(fcid))
    # Timestamped log file name; dry runs log to stdout instead
    dt = datetime.now()
    time_str = "_".join([
        str(dt.year), str(dt.month), str(dt.day),
        str(dt.hour), str(dt.minute), str(dt.second)
    ])
    logfilename = os.path.join(os.path.normpath(args.logpath),
                               "{:s}.log".format(time_str))
    if not args.dry:
        logfile = open(logfilename, "w")
    else:
        logfile = sys.stdout
    logfile.write("[{:s}] - Project to move files for:\n{:s}\n".format(
        utc_time(), args.project_name))
    logfile.flush()
    proj_base_dir = os.path.join(args.caspath, args.project_name)
    # Interactive mode: collect the sample directories the user declines
    skip_list = []
    if args.interactive:
        for sample_dir in os.listdir(proj_base_dir):
            if not os.path.isdir(os.path.join(proj_base_dir, sample_dir)):
                continue
            if not query_yes_no("Deliver sample {:s}?".format(sample_dir), default="no"):
                skip_list.append(sample_dir)
    created_proj_dir_name = fixProjName(args.project_name)
    del_path_top = '/proj/' + args.uppmax_id + "/INBOX/" + created_proj_dir_name
    to_copy = get_file_copy_list(proj_base_dir, del_path_top, fcid,
                                 args.deliver_all_fcs, args.deliver_nophix, skip_list)
    # Prompt user if any of the files are non-compressed
    for fqfile, _, _ in to_copy:
        if os.path.splitext(fqfile)[1] == ".gz":
            continue
        print("WARNING: The file {:s}, which you are about to deliver, does not seem to be compressed. " \
              "It is recommended that you compress files prior to delivery.".format(fqfile))
        if query_yes_no("Do you wish to continue delivering " \
                        "uncompressed fastq files?", default="yes"):
            break
        clean_exit(1, logfile, args.dry)
    # BUGFIX: rsync_files takes (to_copy, logfile, group, dry); the call
    # previously omitted the group argument and raised TypeError.
    rsync_files(to_copy, logfile, args.group, args.dry)
    clean_exit(0, logfile, args.dry)
def raw_data(self):
    """Deliver raw fastq data for a project to its Uppnex INBOX.

    Looks up the project in the production area and StatusDB, collects the
    fastq files per sample and flowcell, transfers them (rsync), verifies
    each transfer with an md5 checksum written next to the destination
    file, and logs the delivery both to a json file in the project
    directory and to the flowcell document in StatusDB.
    """
    if not self._check_pargs(["project"]):
        return

    # if necessary, reformat flowcell identifier (keep only the final
    # <position+id> component)
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

    # get the uid and gid to use for destination files
    # NOTE(review): uid/gid are computed but not used further down in this
    # method; the getgrnam() call still validates that the given group
    # exists (raises KeyError otherwise).
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        # BUG FIX: previously referenced an undefined name 'group'
        # (NameError whenever a group was supplied); the group name comes
        # from the parsed arguments.
        gid = grp.getgrnam(self.pargs.group).gr_gid

    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project database"
    self.log.debug("Connecting to flowcell database")
    f_con = FlowcellRunMetricsConnection(**vars(self.pargs))
    assert f_con, "Could not get connection to flowcell database"
    self.log.debug("Connecting to x_flowcell database")
    x_con = X_FlowcellRunMetricsConnection(**vars(self.pargs))
    assert x_con, "Could not get connection to x_flowcell database"

    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(
            self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error(
                "Uppmax project was not specified and could not be fetched from project database"
            )
            return

    # Setup paths and verify parameters
    self._meta.production_root = self.pargs.root if self.pargs.root else self.app.config.get(
        "production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(
        self._meta.production_root
    ), "No such directory {}; check your production config".format(
        self._meta.production_root)
    assert os.path.exists(
        proj_base_dir), "No project {} in production path {}".format(
            self.pargs.project, self._meta.root_path)
    try:
        self._meta.uppnex_project_root = self.app.config.get(
            "deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn(
            "{}, will use '/proj' as uppnext_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'
    try:
        self._meta.uppnex_delivery_dir = self.app.config.get(
            "deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn(
            "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                e))
        self._meta.uppnex_delivery_dir = 'INBOX'
    destination_root = os.path.join(self._meta.uppnex_project_root,
                                    self.pargs.uppmax_project,
                                    self._meta.uppnex_delivery_dir)
    assert os.path.exists(
        destination_root
    ), "Delivery destination folder {} does not exist".format(
        destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)

    # Find uncompressed fastq; refuse to deliver if any are found
    uncompressed = self._find_uncompressed_fastq_files(
        proj_base_dir=proj_base_dir,
        sample=self.pargs.sample,
        flowcell=self.pargs.flowcell)
    if len(uncompressed) > 0:
        self.log.error(
            "There are uncompressed fastq file for project, kindly check all files are compressed properly before delivery"
        )
        return

    # Extract the list of samples and runs associated with the project and sort them
    samples = self.samples_to_copy(
        pid=p_con.get_entry(self.pargs.project, "project_id"),
        pod=p_con.get_entry(self.pargs.project, "open_date"),
        fc_dict={
            'HiSeq2500': f_con.proj_list,
            'HiSeqX': x_con.proj_list
        },
        proj_base_dir=proj_base_dir,
        destination_root=destination_root,
        sample=self.pargs.sample,
        flowcell=self.pargs.flowcell)

    # If interactively select, build a list of samples to skip
    if self.pargs.interactive:
        to_process = {}
        for sample in samples:
            if query_yes_no("Deliver sample {} ?".format(sample),
                            default="no"):
                to_process[sample] = samples[sample]
        samples = to_process

    # Restrict to a single requested sample, if given
    if self.pargs.sample:
        sample = samples.get(self.pargs.sample)
        if not sample:
            self.log.error(
                "There is no such sample {} for project {}".format(
                    self.pargs.sample, self.pargs.project))
            return
        samples = {self.pargs.sample: sample}

    self.log.info(
        "Will deliver data for {} samples from project {} to {}".format(
            len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return

    # Make sure that transfer will be with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no(
                "Do you wish to continue delivering using rsync?",
                default="yes"):
            return
        self.pargs.rsync = True

    # Process each sample
    for sample, flowcells in samples.iteritems():
        for fc, files in flowcells.iteritems():
            self.log.info("Processing sample {} and flowcell {}".format(
                sample, fc))

            # transfer files
            self.log.debug("Transferring {} fastq files".format(
                len(files['src'])))
            self._transfer_files(sources=files['src'],
                                 targets=files['dst'])

            passed = True
            if self.pargs.link or self.pargs.dry_run:
                # links / dry runs cannot be md5-verified, so never log
                # them to StatusDB below
                passed = False
            else:
                # calculate md5sums on the source side and write it on the destination
                md5 = []
                for s, d in zip(files['src'], files['dst']):
                    m = md5sum(s)
                    mfile = "{}.md5".format(d)
                    md5.append([m, mfile, s])
                    self.log.debug("md5sum for source file {}: {}".format(
                        s, m))

                # write the md5sum to a file at the destination and verify the transfer
                for m, mfile, srcpath in md5:
                    dstfile = os.path.splitext(mfile)[0]
                    self.log.debug(
                        "Writing md5sum to file {}".format(mfile))
                    self.app.cmd.write(
                        mfile,
                        "{} {}".format(m, os.path.basename(dstfile)), True)
                    self.log.debug(
                        "Verifying md5sum for file {}".format(dstfile))
                    dm = md5sum(dstfile)
                    self.log.debug(
                        "md5sum for destination file {}: {}".format(
                            dstfile, dm))
                    if m != dm:
                        # mismatch: remove the bad copy so a retry starts clean
                        self.log.warn(
                            "md5sum verification FAILED for {}. Source: {}, Target: {}"
                            .format(dstfile, m, dm))
                        self.log.warn(
                            "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                            .format(dstfile))
                        self.app.cmd.safe_unlink(dstfile)
                        self.app.cmd.safe_unlink(mfile)
                        passed = False
                        continue

                    # Modify the permissions to ug+rw
                    for f in [dstfile, mfile]:
                        self.app.cmd.chmod(
                            f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                            | stat.S_IWGRP)

            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(
                os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                             self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                data = {
                    'raw_data_delivery': {
                        'timestamp': utc_time(),
                        'files': {
                            os.path.splitext(
                                (os.path.basename(srcpath)))[0]: {
                                'md5': m,
                                'path': os.path.splitext(mfile)[0],
                                'size_in_bytes':
                                self._getsize(os.path.splitext(mfile)[0]),
                                'source_location': srcpath
                            }
                            for m, mfile, srcpath in md5
                        }
                    }
                }
                jsonstr = json.dumps(data)
                jsonfile = os.path.join(
                    proj_base_dir, sample, fc,
                    "{}_{}_raw_data_delivery.json".format(sample, fc))
                self.log.debug(
                    "Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                # BUG FIX: the original interpolated the *builtin* 'id'
                # function into this message; log the flowcell id instead.
                self.log.debug(
                    "Saving delivery in StatusDB document {}".format(fc))
                if self.proj_flowcells[fc]['type'] == 'HiSeqX':
                    fc_con = x_con
                else:
                    fc_con = f_con
                fc_obj = fc_con.get_entry(fc)
                self.log.info(
                    "Logging delivery to StatusDB document {}".format(
                        fc_obj.get('_id')))
                fc_raw_data = fc_obj.get('raw_data_delivery', {})
                fc_raw_data.update(data['raw_data_delivery'])
                fc_obj['raw_data_delivery'] = fc_raw_data
                self._save(fc_con, fc_obj)
                self.log.debug(jsonstr)