def raw_data(self):
    if not self._check_pargs(["project"]):
        return

    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

    # get the uid and gid to use for destination files
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        gid = grp.getgrnam(self.pargs.group).gr_gid

    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project database"
    self.log.debug("Connecting to samples database")
    s_con = SampleRunMetricsConnection(**vars(self.pargs))
    assert s_con, "Could not get connection to samples database"

    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error("Uppmax project was not specified and could not be fetched from project database")
            return

    # Extract the list of samples and runs associated with the project and sort them
    samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell, sample_prj=self.pargs.project),
                     key=lambda k: (k.get('project_sample_name', 'NA'),
                                    k.get('flowcell', 'NA'),
                                    k.get('lane', 'NA')))

    # Set up paths and verify parameters
    self._meta.production_root = self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(self._meta.production_root), \
        "No such directory {}; check your production config".format(self._meta.production_root)
    assert os.path.exists(proj_base_dir), \
        "No project {} in production path {}".format(self.pargs.project, self._meta.root_path)
    try:
        self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn("{}, will use '/proj' as uppnex_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'
    try:
        self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn("{}, will use 'INBOX' as uppnex_project_delivery_path".format(e))
        self._meta.uppnex_delivery_dir = 'INBOX'
    destination_root = os.path.join(self._meta.uppnex_project_root,
                                    self.pargs.uppmax_project,
                                    self._meta.uppnex_delivery_dir)
    assert os.path.exists(destination_root), \
        "Delivery destination folder {} does not exist".format(destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)

    # If running interactively, ask which of the samples to deliver
    if self.pargs.interactive:
        to_process = []
        for sample in samples:
            sname = sample.get("project_sample_name")
            index = sample.get("sequence")
            fcid = sample.get("flowcell")
            lane = sample.get("lane")
            date = sample.get("date")
            self.log.info("Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}".format(
                sname, index, fcid, lane, date))
            if query_yes_no("Deliver sample?", default="no"):
                to_process.append(sample)
        samples = to_process

    # Find uncompressed fastq files
    uncompressed = self._find_uncompressed_fastq_files(proj_base_dir, samples)
    if len(uncompressed) > 0:
        self.log.warn("The following samples have uncompressed *.fastq files that cannot be delivered: {}".format(
            ",".join(uncompressed)))
        if not query_yes_no("Continue anyway?", default="no"):
            return

    self.log.info("Will deliver data for {} samples from project {} to {}".format(
        len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return

    # Get the list of files to transfer and their destinations
    self.log.debug("Gathering list of files to copy")
    to_copy = self.get_file_copy_list(proj_base_dir, destination_root, samples)

    # Make sure that the transfer will be done with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
            return
        self.pargs.rsync = True

    # Process each sample run
    for id, files in to_copy.items():
        # get the sample database object
        [sample] = [s for s in samples if s.get('_id') == id]
        self.log.info("Processing sample {} and flowcell {}".format(
            sample.get("project_sample_name", "NA"), sample.get("flowcell", "NA")))

        # transfer files
        self.log.debug("Transferring {} fastq files".format(len(files)))
        self._transfer_files([f[0] for f in files], [f[1] for f in files])

        passed = True
        if self.pargs.link or self.pargs.dry_run:
            passed = False
        else:
            # calculate md5sums on the source side and write them at the destination
            md5 = []
            for f in files:
                m = md5sum(f[0])
                mfile = "{}.md5".format(f[1])
                md5.append([m, mfile, f[2], f[0]])
                self.log.debug("md5sum for source file {}: {}".format(f[0], m))

            # write the md5sum to a file at the destination and verify the transfer
            for m, mfile, read, srcpath in md5:
                dstfile = os.path.splitext(mfile)[0]
                self.log.debug("Writing md5sum to file {}".format(mfile))
                self.app.cmd.write(mfile, "{} {}".format(m, os.path.basename(dstfile)), True)
                self.log.debug("Verifying md5sum for file {}".format(dstfile))
                dm = md5sum(dstfile)
                self.log.debug("md5sum for destination file {}: {}".format(dstfile, dm))
                if m != dm:
                    self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile, m, dm))
                    self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                    self.app.cmd.safe_unlink(dstfile)
                    self.app.cmd.safe_unlink(mfile)
                    passed = False
                    continue

                # Modify the permissions to ug+rw
                for f in [dstfile, mfile]:
                    self.app.cmd.chmod(f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

        # touch the flag to trigger the uppmax inbox permission fix
        self.app.cmd.safe_touchfile(os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                                                 self.pargs.uppmax_project))

        # log the transfer to statusdb if verification passed
        if passed:
            self.log.info("Logging delivery to StatusDB document {}".format(id))
            data = {'raw_data_delivery': {
                'timestamp': utc_time(),
                'files': {'R{}'.format(read): {'md5': m,
                                               'path': os.path.splitext(mfile)[0],
                                               'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                               'source_location': srcpath}
                          for m, mfile, read, srcpath in md5},
            }}
            jsonstr = json.dumps(data)
            jsonfile = os.path.join(os.path.dirname(md5[0][3]),
                                    "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                                        sample.get("date"), sample.get("flowcell"),
                                        sample.get("project_sample_name"),
                                        sample.get("sequence"), sample.get("lane")))
            self.log.debug("Writing delivery to json file {}".format(jsonfile))
            self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
            self.log.debug("Saving delivery in StatusDB document {}".format(id))
            sample.update(data)
            self._save(s_con, sample)
            self.log.debug(jsonstr)
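# For reference, a minimal sketch of what the md5sum() helper used above could
# look like. This is an assumption for illustration only: the actual helper is
# imported from elsewhere in the package, and its signature may differ.
import hashlib

def md5sum(path, blocksize=1024 * 1024):
    """Return the hex md5 digest of the file at `path`, reading in chunks
    so that large fastq files do not have to fit in memory."""
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(blocksize), b""):
            digest.update(chunk)
    return digest.hexdigest()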
def raw_data(self):
    if not self._check_pargs(["project"]):
        return

    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

    # get the uid and gid to use for destination files
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        gid = grp.getgrnam(self.pargs.group).gr_gid

    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project database"
    self.log.debug("Connecting to samples database")
    s_con = SampleRunMetricsConnection(**vars(self.pargs))
    assert s_con, "Could not get connection to samples database"

    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error("Uppmax project was not specified and could not be fetched from project database")
            return

    # Extract the list of samples and runs associated with the project and sort them
    samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell, sample_prj=self.pargs.project),
                     key=lambda k: (k.get('project_sample_name', 'NA'),
                                    k.get('flowcell', 'NA'),
                                    k.get('lane', 'NA')))

    # Set up paths and verify parameters
    self._meta.production_root = self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(self._meta.production_root), \
        "No such directory {}; check your production config".format(self._meta.production_root)
    assert os.path.exists(proj_base_dir), \
        "No project {} in production path {}".format(self.pargs.project, self._meta.root_path)
    try:
        self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn("{}, will use '/proj' as uppnex_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'
    try:
        self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn("{}, will use 'INBOX' as uppnex_project_delivery_path".format(e))
        self._meta.uppnex_delivery_dir = 'INBOX'
    destination_root = os.path.join(self._meta.uppnex_project_root,
                                    self.pargs.uppmax_project,
                                    self._meta.uppnex_delivery_dir)
    assert os.path.exists(destination_root), \
        "Delivery destination folder {} does not exist".format(destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)

    # If running interactively, ask which of the samples to deliver
    if self.pargs.interactive:
        to_process = []
        for sample in samples:
            sname = sample.get("project_sample_name")
            index = sample.get("sequence")
            fcid = sample.get("flowcell")
            lane = sample.get("lane")
            date = sample.get("date")
            self.log.info("Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}".format(
                sname, index, fcid, lane, date))
            if query_yes_no("Deliver sample?", default="no"):
                to_process.append(sample)
        samples = to_process

    # Find uncompressed fastq files
    uncompressed = self._find_uncompressed_fastq_files(proj_base_dir, samples)
    if len(uncompressed) > 0:
        self.log.warn("The following samples have uncompressed *.fastq files that cannot be delivered: {}".format(
            ",".join(uncompressed)))
        if not query_yes_no("Continue anyway?", default="no"):
            return

    self.log.info("Will deliver data for {} samples from project {} to {}".format(
        len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return

    # Get the list of files to transfer and their destinations
    self.log.debug("Gathering list of files to copy")
    to_copy = self.get_file_copy_list(proj_base_dir, destination_root, samples)

    # Make sure that the transfer will be done with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
            return
        self.pargs.rsync = True

    # Process each sample run
    for id, files in to_copy.items():
        # get the sample database object
        [sample] = [s for s in samples if s.get('_id') == id]
        self.log.info("Processing sample {} and flowcell {}".format(
            sample.get("project_sample_name", "NA"), sample.get("flowcell", "NA")))

        # calculate md5sums on the source side, to be written at the destination
        md5 = []
        for f in files:
            m = md5sum(f[0])
            mfile = "{}.md5".format(f[1])
            md5.append([m, mfile, f[2], f[0]])
            self.log.debug("md5sum for source file {}: {}".format(f[0], m))

        # transfer files
        self.log.debug("Transferring {} fastq files".format(len(files)))
        self._transfer_files([f[0] for f in files], [f[1] for f in files])

        # write the md5sum to a file at the destination and verify the transfer
        passed = True
        for m, mfile, read, srcpath in md5:
            dstfile = os.path.splitext(mfile)[0]
            self.log.debug("Writing md5sum to file {}".format(mfile))
            self.app.cmd.write(mfile, "{} {}".format(m, os.path.basename(dstfile)), True)
            self.log.debug("Verifying md5sum for file {}".format(dstfile))
            # if dry-run, make sure the verification passes
            if self.pargs.dry_run:
                dm = m
            else:
                dm = md5sum(dstfile)
            self.log.debug("md5sum for destination file {}: {}".format(dstfile, dm))
            if m != dm:
                self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile, m, dm))
                self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                self.app.cmd.safe_unlink(dstfile)
                self.app.cmd.safe_unlink(mfile)
                passed = False
                continue

            # Modify the permissions to ug+rw
            for f in [dstfile, mfile]:
                self.app.cmd.chmod(f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

        # touch the flag to trigger the uppmax inbox permission fix
        self.app.cmd.safe_touchfile(os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                                                 self.pargs.uppmax_project))

        # log the transfer to statusdb if verification passed
        if passed:
            self.log.info("Logging delivery to StatusDB document {}".format(id))
            data = {'raw_data_delivery': {
                'timestamp': utc_time(),
                'files': {'R{}'.format(read): {'md5': m,
                                               'path': os.path.splitext(mfile)[0],
                                               'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                               'source_location': srcpath}
                          for m, mfile, read, srcpath in md5},
            }}
            jsonstr = json.dumps(data)
            jsonfile = os.path.join(os.path.dirname(md5[0][3]),
                                    "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                                        sample.get("date"), sample.get("flowcell"),
                                        sample.get("project_sample_name"),
                                        sample.get("sequence"), sample.get("lane")))
            self.log.debug("Writing delivery to json file {}".format(jsonfile))
            self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
            self.log.debug("Saving delivery in StatusDB document {}".format(id))
            sample.update(data)
            self._save(s_con, sample)
            self.log.debug(jsonstr)
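# For reference, a minimal sketch of the query_yes_no() prompt used above.
# Assumed behaviour (not confirmed by this module): it returns True/False, and
# an empty answer falls back to `default`.
import sys

try:
    _input = raw_input  # Python 2
except NameError:
    _input = input  # Python 3

def query_yes_no(question, default="yes"):
    """Ask a yes/no question on stdin and return the answer as a bool."""
    valid = {"yes": True, "y": True, "no": False, "n": False}
    prompt = " [Y/n] " if default == "yes" else " [y/N] "
    while True:
        choice = _input(question + prompt).strip().lower()
        if not choice and default is not None:
            return valid[default]
        if choice in valid:
            return valid[choice]
        sys.stdout.write("Please answer 'yes' or 'no'\n")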
def raw_data(self):
    if not self._check_pargs(["project"]):
        return

    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

    # get the uid and gid to use for destination files
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        gid = grp.getgrnam(self.pargs.group).gr_gid

    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project database"
    self.log.debug("Connecting to flowcell database")
    f_con = FlowcellRunMetricsConnection(**vars(self.pargs))
    assert f_con, "Could not get connection to flowcell database"
    self.log.debug("Connecting to x_flowcell database")
    x_con = X_FlowcellRunMetricsConnection(**vars(self.pargs))
    assert x_con, "Could not get connection to x_flowcell database"

    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error("Uppmax project was not specified and could not be fetched from project database")
            return

    # Set up paths and verify parameters
    self._meta.production_root = self.pargs.root if self.pargs.root else self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(self._meta.production_root), \
        "No such directory {}; check your production config".format(self._meta.production_root)
    assert os.path.exists(proj_base_dir), \
        "No project {} in production path {}".format(self.pargs.project, self._meta.root_path)
    try:
        self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn("{}, will use '/proj' as uppnex_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'
    try:
        self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn("{}, will use 'INBOX' as uppnex_project_delivery_path".format(e))
        self._meta.uppnex_delivery_dir = 'INBOX'
    destination_root = os.path.join(self._meta.uppnex_project_root,
                                    self.pargs.uppmax_project,
                                    self._meta.uppnex_delivery_dir)
    assert os.path.exists(destination_root), \
        "Delivery destination folder {} does not exist".format(destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)

    # Find uncompressed fastq files
    uncompressed = self._find_uncompressed_fastq_files(proj_base_dir=proj_base_dir,
                                                       sample=self.pargs.sample,
                                                       flowcell=self.pargs.flowcell)
    if len(uncompressed) > 0:
        self.log.error("There are uncompressed fastq files for the project; please check that all files are properly compressed before delivery")
        return

    # Extract the list of samples and runs associated with the project and sort them
    samples = self.samples_to_copy(
        pid=p_con.get_entry(self.pargs.project, "project_id"),
        pod=p_con.get_entry(self.pargs.project, "open_date"),
        fc_dict={'HiSeq2500': f_con.proj_list, 'HiSeqX': x_con.proj_list},
        proj_base_dir=proj_base_dir,
        destination_root=destination_root,
        sample=self.pargs.sample,
        flowcell=self.pargs.flowcell)

    # If running interactively, ask which of the samples to deliver
    if self.pargs.interactive:
        to_process = {}
        for sample in samples:
            if query_yes_no("Deliver sample {} ?".format(sample), default="no"):
                to_process[sample] = samples[sample]
        samples = to_process

    # If a specific sample was requested, restrict delivery to it
    if self.pargs.sample:
        sample = samples.get(self.pargs.sample)
        if not sample:
            self.log.error("There is no such sample {} for project {}".format(
                self.pargs.sample, self.pargs.project))
            return
        samples = {self.pargs.sample: sample}

    self.log.info("Will deliver data for {} samples from project {} to {}".format(
        len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return

    # Make sure that the transfer will be done with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
            return
        self.pargs.rsync = True

    # Process each sample
    for sample, flowcells in samples.iteritems():
        for fc, files in flowcells.iteritems():
            self.log.info("Processing sample {} and flowcell {}".format(sample, fc))

            # transfer files
            self.log.debug("Transferring {} fastq files".format(len(files['src'])))
            self._transfer_files(sources=files['src'], targets=files['dst'])

            passed = True
            if self.pargs.link or self.pargs.dry_run:
                passed = False
            else:
                # calculate md5sums on the source side and write them at the destination
                md5 = []
                for s, d in zip(files['src'], files['dst']):
                    m = md5sum(s)
                    mfile = "{}.md5".format(d)
                    md5.append([m, mfile, s])
                    self.log.debug("md5sum for source file {}: {}".format(s, m))

                # write the md5sum to a file at the destination and verify the transfer
                for m, mfile, srcpath in md5:
                    dstfile = os.path.splitext(mfile)[0]
                    self.log.debug("Writing md5sum to file {}".format(mfile))
                    self.app.cmd.write(mfile, "{} {}".format(m, os.path.basename(dstfile)), True)
                    self.log.debug("Verifying md5sum for file {}".format(dstfile))
                    dm = md5sum(dstfile)
                    self.log.debug("md5sum for destination file {}: {}".format(dstfile, dm))
                    if m != dm:
                        self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile, m, dm))
                        self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                        self.app.cmd.safe_unlink(dstfile)
                        self.app.cmd.safe_unlink(mfile)
                        passed = False
                        continue

                    # Modify the permissions to ug+rw
                    for f in [dstfile, mfile]:
                        self.app.cmd.chmod(f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

            # touch the flag to trigger the uppmax inbox permission fix
            self.app.cmd.safe_touchfile(os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                                                     self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                data = {'raw_data_delivery': {
                    'timestamp': utc_time(),
                    'files': {os.path.splitext(os.path.basename(srcpath))[0]: {
                                  'md5': m,
                                  'path': os.path.splitext(mfile)[0],
                                  'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                  'source_location': srcpath}
                              for m, mfile, srcpath in md5}
                }}
                jsonstr = json.dumps(data)
                jsonfile = os.path.join(proj_base_dir, sample, fc,
                                        "{}_{}_raw_data_delivery.json".format(sample, fc))
                self.log.debug("Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                self.log.debug("Saving delivery in StatusDB document for flowcell {}".format(fc))
                if self.proj_flowcells[fc]['type'] == 'HiSeqX':
                    fc_con = x_con
                else:
                    fc_con = f_con
                fc_obj = fc_con.get_entry(fc)
                self.log.info("Logging delivery to StatusDB document {}".format(fc_obj.get('_id')))
                fc_raw_data = fc_obj.get('raw_data_delivery', {})
                fc_raw_data.update(data['raw_data_delivery'])
                fc_obj['raw_data_delivery'] = fc_raw_data
                self._save(fc_con, fc_obj)
                self.log.debug(jsonstr)
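# For reference, a minimal standalone sketch of an rsync-based transfer such as
# the _transfer_files() calls above rely on. The flags and the absence of
# --dry-run/--link handling are assumptions; the real method is provided by the
# delivery controller and honours those options.
import os
import subprocess

def transfer_files(sources, targets):
    """Copy each source file to its target path with rsync -a, creating
    destination directories as needed."""
    for src, dst in zip(sources, targets):
        dstdir = os.path.dirname(dst)
        if not os.path.isdir(dstdir):
            os.makedirs(dstdir)
        # -a preserves permissions and timestamps; md5 verification is still
        # done separately, as in raw_data() above
        subprocess.check_call(["rsync", "-a", src, dst])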