def get_md5sums(self): """ Generate MD5sums Generate and return MD5 sums for the file and for the uncompressed contents. Sets the 'md5' and 'uncompressed_md5' properties on the current instance. Returns tuple (md5,md5_uncompressed_contents). """ if self.is_link or self.is_dir: # Ignore links or directories return (None, None) if self.md5 is None: # Generate MD5 sum self.md5 = Md5sum.md5sum(self.path) if self.uncompressed_md5 is None: # Generate MD5 for uncompressed contents if not self.compression: self.uncompressed_md5 = self.md5 elif self.compression == 'bz2': fp = bz2.BZ2File(self.path, 'r') self.uncompressed_md5 = Md5sum.md5sum(fp) elif self.compression == 'gz': fp = gzip.GzipFile(self.path, 'rb') self.uncompressed_md5 = Md5sum.md5sum(fp) else: logging.warning("%s: md5sums not implemented for " "compression type '%s'" % (self, self.compression)) return (self.md5, self.uncompressed_md5)
def get_md5sums(self): """ Generate MD5sums Generate and return MD5 sums for the file and for the uncompressed contents. Sets the 'md5' and 'uncompressed_md5' properties on the current instance. Returns tuple (md5,md5_uncompressed_contents). """ if self.is_link or self.is_dir: # Ignore links or directories return (None,None) if self.md5 is None: # Generate MD5 sum self.md5 = Md5sum.md5sum(self.path) if self.uncompressed_md5 is None: # Generate MD5 for uncompressed contents if not self.compression: self.uncompressed_md5 = self.md5 elif self.compression == 'bz2': fp = bz2.BZ2File(self.path,'r') self.uncompressed_md5 = Md5sum.md5sum(fp) elif self.compression == 'gz': fp = gzip.GzipFile(self.path,'rb') self.uncompressed_md5 = Md5sum.md5sum(fp) else: logging.warning("%s: md5sums not implemented for " "compression type '%s'" % (self,self.compression)) return (self.md5,self.uncompressed_md5)
def verify_md5sums(chksum_file,verbose=False): """Check the MD5 sums for all entries specified in a file For all entries in the supplied file, check the MD5 sum is the same as that calculated by the function, and report whether they match or are different. The input file can either be output from this program or from the Linux 'md5sum' program. Arguments: chksum_file: name of the file containing the MD5 sums verbose: (optional) if True then report status for all files checked, plus a summary; otherwise only report failures Returns: Zero on success, 1 if errors were encountered """ # Set up reporter object reporter = Md5sum.Md5CheckReporter(Md5sum.Md5Checker.verify_md5sums(chksum_file), verbose=verbose) # Summarise if verbose: reporter.summary() return reporter.status
def compute_md5sum_for_file(filen,output_file=None): """Compute and write MD5 sum for specifed file Computes the MD5 sum for a file, and writes the sum and the file name either to stdout or to the specified file name. Note that the output format is compatible with the Linux 'md5sum' program's '-c' option. Arguments: filen: file to compute the MD5 sum for output_file: (optional) name of file to write MD5 sum to Returns: Zero on success, 1 if errors were encountered """ retval = 1 if output_file: fp = open(output_file,'w') else: fp = sys.stdout try: chksum = Md5sum.md5sum(filen) fp.write("%s %s\n" % (chksum,filen)) except IOError, ex: # Error accessing file, report and skip logging.error("%s: error while generating MD5 sum: '%s'" % (filen,ex)) retval = 1
def write_checksums(project, pattern=None, filen=None, relative=True): """Write MD5 checksums for fastq files with an AnalysisProject Arguments: project: AnalysisProject instance pattern: if supplied then use the supplied pattern to filter fastqs based on filename filen: if supplied then checksums will be written to this file; otherwise they will be written to stdout (default) relative: if True (default) then fastq file names will be the basename; otherwise they will be the full paths. """ if filen: fp = open(md5file, 'w') else: fp = sys.stdout for sample_name, fastq, fq in get_fastqs(project, pattern=pattern): if relative: name = os.path.basename(fq) else: name = fq fp.write("%s %s\n" % (md5sum.md5sum(fq), name)) if filen: fp.close()
def diff_files(filen1,filen2,verbose=False): """Check that the MD5 sums of two files match This compares two files by computing the MD5 sums for each. Arguments: filen1: "source" file filen2: "target" file to be compared with filen1 verbose: (optional) if True then report status for all files checked; otherwise only report summary Returns: Zero on success, 1 if errors were encountered """ # Set up reporter object reporter = Md5sum.Md5CheckReporter() # Compare files reporter.add_result(filen1,Md5sum.Md5Checker.md5cmp_files(filen1,filen2)) if verbose: if reporter.n_ok: print("OK: MD5 sums match") elif reporter.n_failed: print("FAILED: MD5 sums don't match") else: print("ERROR: unable to compute one or both MD5 sums") return reporter.status
def compute_md5sum_for_file(filen,output_file=None): """Compute and write MD5 sum for specifed file Computes the MD5 sum for a file, and writes the sum and the file name either to stdout or to the specified file name. Note that the output format is compatible with the Linux 'md5sum' program's '-c' option. Arguments: filen: file to compute the MD5 sum for output_file: (optional) name of file to write MD5 sum to Returns: Zero on success, 1 if errors were encountered """ retval = 0 if output_file: fp = io.open(output_file,'wt') else: fp = sys.stdout try: chksum = Md5sum.md5sum(filen) fp.write(u"%s %s\n" % (chksum,filen)) except IOError as ex: # Error accessing file, report and skip logging.error("%s: error while generating MD5 sum: '%s'" % (filen,ex)) retval = 1 if output_file: fp.close() return retval
def write_checksums(project,pattern=None,filen=None,relative=True): """Write MD5 checksums for fastq files with an AnalysisProject Arguments: project: AnalysisProject instance pattern: if supplied then use the supplied pattern to filter fastqs based on filename filen: if supplied then checksums will be written to this file; otherwise they will be written to stdout (default) relative: if True (default) then fastq file names will be the basename; otherwise they will be the full paths. """ if filen: fp = open(md5file,'w') else: fp = sys.stdout for sample_name,fastq,fq in get_fastqs(project,pattern=pattern): if relative: name = os.path.basename(fq) else: name = fq fp.write("%s %s\n" % (md5sum.md5sum(fq),name)) if filen: fp.close()
def diff_directories(dirn1,dirn2,verbose=False): """Check one directory against another using MD5 sums This compares one directory against another by computing the MD5 sums for the contents of the first, and then checking these against the second. (Essentially this is automatically performing the compute/verify steps in a single operation.) Note that if there are different files in one directory compared with the other then this function will give different results depending on the order the directories are specified. However for common files the actual MD5 sums will be the same regardless of order. Arguments: dirn1: "source" directory dirn2: "target" directory to be compared to dirn1 verbose: (optional) if True then report status for all files checked; otherwise only report summary Returns: Zero on success, 1 if errors were encountered """ # Set up reporter object reporter = Md5sum.Md5CheckReporter(Md5sum.Md5Checker.md5cmp_dirs(dirn1,dirn2), verbose=verbose) # Summarise if verbose: reporter.summary() return reporter.status
def copy_to_dest(f, dirn, chksum=None, link=False): """Copy a file to a local or remote destination Raises an exception if the copy operation fails. If 'chksum' argument is supplied then the MD5 sum of the copy is also verified against this and an exception is raised if this fails to match. Arguments: f: file to copy (must be local) dirn: target directory, either local or of the form "[user@]host:dir" chksum: (optional) MD5 sum of the original file to match against the copy link: (optional) if True then hard link files instead of copying """ if not exists(f): raise Exception("'%s': not found" % f) if not exists(dirn): raise Exception("'%s': destination not found" % dirn) # Copy the file copy(f, dirn, link=link) if chksum is not None: user, host, dest = utils.split_user_host_dir(dirn) remote = (host is not None) if not remote: # Check local copy if chksum is not None: if md5sum.md5sum(f) != chksum: raise Exception("MD5 checksum failed for " "copy of %s" % f) else: # Check remote copy try: # Run md5sum -c on the remote system if chksum is not None: md5sum_check = applications.general.ssh_command( user, host, ('echo', '"%s %s"' % (chksum, os.path.join(dest, os.path.basename(f))), '|', 'md5sum', '-c')) print("Running %s" % md5sum_check) md5sum_check.run_subprocess() except Exception as ex: raise Exception("Failed to copy %s to %s: %s" % (f, dirn, ex))
def print_md5sums(library): """Calculate and print md5sums for primary data files in library This will generate a list of md5sums that can be passed to the md5sum program to check against a copy of the the runs using md5sum -c CHECKSUMS Arguments: library: SolidLibrary instance. """ # F3 primary data try: print "%s %s" % (Md5sum.md5sum( library.csfasta), strip_prefix(library.csfasta, os.getcwd())) except Exception, ex: logging.error("FAILED for F3 csfasta: %s" % ex)
def print_md5sums(library): """Calculate and print md5sums for primary data files in library This will generate a list of md5sums that can be passed to the md5sum program to check against a copy of the the runs using md5sum -c CHECKSUMS Arguments: library: SolidLibrary instance. """ # F3 primary data try: print "%s %s" % (Md5sum.md5sum(library.csfasta), strip_prefix(library.csfasta,os.getcwd())) except Exception,ex: logging.error("FAILED for F3 csfasta: %s" % ex)
def copy_to_dest(f,dirn,chksum=None): """Copy a file to a local or remote destination Raises an exception if the copy operation fails. If 'chksum' argument is supplied then the MD5 sum of the copy is also verified against this and an exception is raised if this fails to match. Arguments: f: file to copy (must be local) dirn: target directory, either local or of the form "[user@]host:dir" chksum: (optional) MD5 sum of the original file to match against the copy """ if not os.path.exists(f): raise Exception("File %s doesn't exist" % f) user,host,dest = utils.split_user_host_dir(dirn) remote = (host is not None) if not remote: # Local copy shutil.copy(f,dirn) if chksum is not None: if md5sum.md5sum(f) != chksum: raise Exception("MD5 checksum failed for copy of %s" % f) else: # Remote copy try: scp = applications.general.scp(user,host,f,dest) print "Running %s" % scp scp.run_subprocess() # Run md5sum -c on the remote system if chksum is not None: md5sum_check = applications.general.ssh_command( user,host, ('echo', '"%s %s"' % (chksum, os.path.join(dest,os.path.basename(f))), '|','md5sum','-c')) print "Running %s" % md5sum_check md5sum_check.run_subprocess() except Exception, ex: raise Exception("Failed to copy %s to %s: %s" % (f,dirn,ex))
def copy_to_dest(f, dirn, chksum=None): """Copy a file to a local or remote destination Raises an exception if the copy operation fails. If 'chksum' argument is supplied then the MD5 sum of the copy is also verified against this and an exception is raised if this fails to match. Arguments: f: file to copy (must be local) dirn: target directory, either local or of the form "[user@]host:dir" chksum: (optional) MD5 sum of the original file to match against the copy """ if not os.path.exists(f): raise Exception("File %s doesn't exist" % f) user, host, dest = utils.split_user_host_dir(dirn) remote = (host is not None) if not remote: # Local copy shutil.copy(f, dirn) if chksum is not None: if md5sum.md5sum(f) != chksum: raise Exception("MD5 checksum failed for copy of %s" % f) else: # Remote copy try: scp = applications.general.scp(user, host, f, dest) print "Running %s" % scp scp.run_subprocess() # Run md5sum -c on the remote system if chksum is not None: md5sum_check = applications.general.ssh_command( user, host, ('echo', '"%s %s"' % (chksum, os.path.join(dest, os.path.basename(f))), '|', 'md5sum', '-c')) print "Running %s" % md5sum_check md5sum_check.run_subprocess() except Exception, ex: raise Exception("Failed to copy %s to %s: %s" % (f, dirn, ex))
This will generate a list of md5sums that can be passed to the md5sum program to check against a copy of the the runs using md5sum -c CHECKSUMS Arguments: library: SolidLibrary instance. """ # F3 primary data try: print "%s %s" % (Md5sum.md5sum( library.csfasta), strip_prefix(library.csfasta, os.getcwd())) except Exception, ex: logging.error("FAILED for F3 csfasta: %s" % ex) try: print "%s %s" % (Md5sum.md5sum( library.qual), strip_prefix(library.qual, os.getcwd())) except Exception, ex: logging.error("FAILED for F3 qual: %s" % ex) # F5 primary data if library.parent_sample.parent_run.is_paired_end: try: print "%s %s" % (Md5sum.md5sum(library.csfasta_f5), strip_prefix(library.csfasta_f5, os.getcwd())) except Exception, ex: logging.error("FAILED for F5 csfasta: %s" % ex) try: print "%s %s" % (Md5sum.md5sum( library.qual_f5), strip_prefix(library.qual_f5, os.getcwd())) except Exception, ex: logging.error("FAILED for F5 qual: %s" % ex)
This will generate a list of md5sums that can be passed to the md5sum program to check against a copy of the the runs using md5sum -c CHECKSUMS Arguments: library: SolidLibrary instance. """ # F3 primary data try: print "%s %s" % (Md5sum.md5sum(library.csfasta), strip_prefix(library.csfasta,os.getcwd())) except Exception,ex: logging.error("FAILED for F3 csfasta: %s" % ex) try: print "%s %s" % (Md5sum.md5sum(library.qual), strip_prefix(library.qual,os.getcwd())) except Exception,ex: logging.error("FAILED for F3 qual: %s" % ex) # F5 primary data if library.parent_sample.parent_run.is_paired_end: try: print "%s %s" % (Md5sum.md5sum(library.csfasta_f5), strip_prefix(library.csfasta_f5,os.getcwd())) except Exception,ex: logging.error("FAILED for F5 csfasta: %s" % ex) try: print "%s %s" % (Md5sum.md5sum(library.qual_f5), strip_prefix(library.qual_f5,os.getcwd())) except Exception,ex: logging.error("FAILED for F5 qual: %s" % ex)
total_size = 0 for fq in fastqs: fsize = os.lstat(fq).st_size total_size += fsize print "%s\t%s" % (os.path.basename(fq), bcf_utils.format_file_size(fsize)) print "Total: %s" % bcf_utils.format_file_size(total_size) # Generate MD5 checksum file if not options.dry_run: tmpdir = tempfile.mkdtemp(suffix='checksums.md5', dir=os.getcwd()) md5_file = os.path.join(tmpdir,'checksums.md5') print "Generating MD5 sums in %s" % md5_file fp = open(md5_file,'w') for fq in fastqs: chksum = Md5sum.md5sum(fq) fp.write("%s %s\n" % (chksum,os.path.basename(fq))) fp.close() # Copy the fastqs print "Copying fastqs" for fq in fastqs: print "%s" % os.path.basename(fq) if not options.dry_run: copy_to_dest(fq,args[1]) if not options.dry_run: print "Copying MD5 checksum file" copy_to_dest(md5_file,args[1]) shutil.rmtree(tmpdir)
class ArchiveFile(utils.PathInfo): """ Class for storing information about a file """ def __init__(self, filen): """ Create and populate a new ArchiveFile instance """ utils.PathInfo.__init__(self, filen) # !!!FIXME should be able to st_size from PathInfo!!! self.size = os.lstat(filen).st_size self.timestamp = self.mtime self.ext, self.compression = get_file_extensions(filen) self.md5 = None self.uncompressed_md5 = None @property def basename(self): """ Return the basename of the file path """ return os.path.basename(self.path) @property def classifier(self): """ Return classifier for an ArchiveFile object Return an indicator consistent with 'ls -F' depending on file type: / indicates a directory @ indicates a link * indicates an executable Empty string indicates a regular file. """ if self.is_link: return '@' elif self.is_dir: return os.sep elif self.is_executable: return '*' return '' def get_md5sums(self): """ Generate MD5sums Generate and return MD5 sums for the file and for the uncompressed contents. Sets the 'md5' and 'uncompressed_md5' properties on the current instance. Returns tuple (md5,md5_uncompressed_contents). """ if self.is_link or self.is_dir: # Ignore links or directories return (None, None) if self.md5 is None: # Generate MD5 sum self.md5 = Md5sum.md5sum(self.path) if self.uncompressed_md5 is None: # Generate MD5 for uncompressed contents if not self.compression: self.uncompressed_md5 = self.md5 elif self.compression == 'bz2': fp = bz2.BZ2File(self.path, 'r') self.uncompressed_md5 = Md5sum.md5sum(fp) elif self.compression == 'gz': fp = gzip.GzipFile(self.path, 'rb') self.uncompressed_md5 = Md5sum.md5sum(fp) else: logging.warning("%s: md5sums not implemented for " "compression type '%s'" % (self, self.compression)) return (self.md5, self.uncompressed_md5) def compress(self, dry_run=False): """ Compress the file Performs compression using bzip2, and transfers the timestamp from the original file to the compressed version. If 'dry_run' is True then report the compression operation but don't report anything. Returns status: 0 indicates success -1 indicates nothing to do, no error >0 indicates an error """ if self.compression: logging.warning("%s: already compressed" % self) return -1 # Check for existing compressed file bz2file = self.path + '.bz2' if os.path.exists(bz2file): logging.warning("%s: compressed copy already exists" % self) return -1 # Get MD5 checksum self.get_md5sums() checksum = self.md5 # Capture timestamp for parent directory parent_mtime = os.lstat(os.path.dirname(self.path)).st_mtime # Compress to a temp file bzip2_cmd = applications.Command('bzip2', '-c', self.path) print bzip2_cmd if dry_run: return -1 fd, tmpbz2 = tempfile.mkstemp(dir=os.path.dirname(self.path), suffix='.bz2.tmp') # Execute the compression command try: status = bzip2_cmd.run_subprocess(log=tmpbz2) except Exception, ex: logging.error("Exception compressing %s: %s" % (self, ex)) status = 1 if status != 0: logging.error("Compression failed for %s" % self) else: # Verify the checksum for the contents of the # compressed file uncompressed_checksum = Md5sum.md5sum(bz2.BZ2File(tmpbz2, 'r')) if uncompressed_checksum == checksum: # Rename the compressed file, reset the timestamps # and remove the source os.rename(tmpbz2, bz2file) os.utime(bz2file, (self.mtime, self.mtime)) os.remove(self.path) os.utime(os.path.dirname(self.path), (parent_mtime, parent_mtime)) # Update attributes self.__path = bz2file self.__st = os.lstat(self.__path) self.compression = 'bz2' self.md5 = None else: logging.error("Bad checksum for compressed version of %s" % self) status = 1 # Remove the temp file if os.path.exists(tmpbz2): os.remove(tmpbz2) # Finish return status