def print_md5sums(solid_runs):
    """Calculate and print md5sums for primary data files

    This will generate a list of md5sums that can be passed to the
    md5sum program to check against a copy of the runs using

    md5sum -c CHECKSUMS

    One line is printed per primary data file (F3 csfasta and qual,
    plus the F5 csfasta and qual for paired-end runs). Each line is
    the checksum, two spaces, and the file name after it has been
    passed through strip_prefix() together with the current working
    directory.

    A failure on one file is logged via logging.error and does not
    stop the remaining files from being processed.

    Arguments:
      solid_runs: list or tuple of SolidRun instances.
    """
    # (library attribute holding the file name, message logged on failure)
    f3_files = (("csfasta", "FAILED for F3 csfasta: %s"),
                ("qual", "FAILED for F3 qual: %s"))
    f5_files = (("csfasta_f5", "FAILED for F5 csfasta: %s"),
                ("qual_f5", "FAILED for F5 qual: %s"))
    for run in solid_runs:
        # Paired-end status is a property of the run, so decide once
        # per run which files need to be checksummed
        if SolidData.is_paired_end(run):
            data_files = f3_files + f5_files
        else:
            data_files = f3_files
        for sample in run.samples:
            for library in sample.libraries:
                for attr, failure_message in data_files:
                    try:
                        data_file = getattr(library, attr)
                        # Two characters must separate checksum and
                        # name for 'md5sum -c' to accept the line
                        print("%s  %s" % (Md5sum.md5sum(data_file),
                                          strip_prefix(data_file,
                                                       os.getcwd())))
                    except Exception as ex:
                        # Best effort: report the problem and carry on
                        logging.error(failure_message % ex)
def report_run(solid_runs):
    """Print a brief report about SOLiD runs.

    Writes a short screen report for each supplied run: an underlined
    flow cell/slide layout heading, the run name and date, the number
    of samples, and then for every sample its projects, the library
    name pattern and the primary data files of each library.

    Arguments:
      solid_runs: a list or tuple of SolidRun objects to report.
    """
    for run in solid_runs:
        # Run heading, underlined with '=' to the width of the text
        layout = run.slideLayout()
        run_heading = "Flow Cell %s (%s)" % (str(run.run_info.flow_cell),
                                             str(layout))
        print(run_heading + '\n' + "=" * len(run_heading))
        print("I.D. : %s" % (run.run_info.name))
        print("Date : %s" % (run.run_info.date))
        print("Samples: %d\n" % len(run.samples))
        if SolidData.is_paired_end(run):
            print("Paired-end run\n")
        # One section per sample, underlined with '-'
        for sample in run.samples:
            sample_heading = "Sample %s" % sample
            print(sample_heading + '\n' + "-" * len(sample_heading))
            for project in sample.projects:
                library_summary = project.prettyPrintLibraries()
                project_heading = "Project %s: %s (%d libraries)" % (
                    project.name, library_summary, len(project.libraries))
                # Blank line first; underline matches the heading text
                print('\n' + project_heading + '\n' +
                      "-" * len(project_heading))
                print("Pattern: %s/%s" % (sample,
                                          project.getLibraryNamePattern()))
                # Locations of the primary data for each library
                for library in project.libraries:
                    print("%s\n%s" % (library.csfasta, library.qual))
                    if SolidData.is_paired_end(run):
                        print("%s\n%s" % (library.csfasta_f5,
                                          library.qual_f5))
def report_run(solid_runs):
    """Print a brief report about SOLiD runs.

    Writes a short screen report for each supplied run: an underlined
    flow cell/slide layout heading, the run name and date, the number
    of samples, and then for every sample its projects, the library
    name pattern and the primary data files of each library.

    Arguments:
      solid_runs: a list or tuple of SolidRun objects to report.
    """
    for run in solid_runs:
        # Run heading, underlined with '=' to the width of the text
        layout = run.slideLayout()
        run_heading = "Flow Cell %s (%s)" % (str(run.run_info.flow_cell),
                                             str(layout))
        print(run_heading + '\n' + "=" * len(run_heading))
        print("I.D. : %s" % (run.run_info.name))
        print("Date : %s" % (run.run_info.date))
        print("Samples: %d\n" % len(run.samples))
        if SolidData.is_paired_end(run):
            print("Paired-end run\n")
        # One section per sample, underlined with '-'
        for sample in run.samples:
            sample_heading = "Sample %s" % sample
            print(sample_heading + '\n' + "-" * len(sample_heading))
            for project in sample.projects:
                library_summary = project.prettyPrintLibraries()
                project_heading = "Project %s: %s (%d libraries)" % (
                    project.name, library_summary, len(project.libraries))
                # Blank line first; underline matches the heading text
                print('\n' + project_heading + '\n' +
                      "-" * len(project_heading))
                print("Pattern: %s/%s" % (sample,
                                          project.getLibraryNamePattern()))
                # Locations of the primary data for each library
                for library in project.libraries:
                    print("%s\n%s" % (library.csfasta, library.qual))
                    if SolidData.is_paired_end(run):
                        print("%s\n%s" % (library.csfasta_f5,
                                          library.qual_f5))
def __full_names(self, library, F5):
    """Internal: link names based on 'full' naming scheme

    The link names are the base names of the library's own primary
    data files. The F5 files are used only when the parent run is
    paired-end and F5 is true; otherwise the csfasta/qual (F3) files
    are used.

    Arguments:
      library: library object; its parent_sample.parent_run is the
        run tested for paired-end status
      F5: if true, request names for the F5 reads

    Returns:
      Tuple (csfasta_name, qual_name).
    """
    run = library.parent_sample.parent_run
    use_f5 = SolidData.is_paired_end(run) and F5
    if use_f5:
        csfasta, qual = library.csfasta_f5, library.qual_f5
    else:
        csfasta, qual = library.csfasta, library.qual
    return (os.path.basename(csfasta), os.path.basename(qual))
def __partial_names(self, library, F5):
    """Internal: link names based on 'partial' naming scheme

    Names are built from '<instrument>_<datestamp>_<library name>'.
    For paired-end runs an '_F3' or '_F5' tag is added so the two
    read sets can be told apart.

    Arguments:
      library: library object; its parent_sample.parent_run supplies
        the run information
      F5: if true (and the run is paired-end), return the F5 names

    Returns:
      Tuple (csfasta_name, qual_name).
    """
    run = library.parent_sample.parent_run
    run_info = run.run_info
    name = "_".join((run_info.instrument, run_info.datestamp, library.name))
    if not SolidData.is_paired_end(run):
        # Fragment run: no read tag needed
        return ("%s.csfasta" % name, "%s_QV.qual" % name)
    if F5:
        return ("%s_F5.csfasta" % name, "%s_F5_QV.qual" % name)
    return ("%s_F3.csfasta" % name, "%s_F3_QV.qual" % name)
def __minimal_names(self, library, F5):
    """Internal: link names based on 'minimal' naming scheme

    Names are built from the library name alone. For paired-end runs
    an '_F3' or '_F5' tag is added so the two read sets can be told
    apart.

    Arguments:
      library: library object; its parent_sample.parent_run is the
        run tested for paired-end status
      F5: if true (and the run is paired-end), return the F5 names

    Returns:
      Tuple (csfasta_name, qual_name).
    """
    run = library.parent_sample.parent_run
    if not SolidData.is_paired_end(run):
        # Fragment run: library name on its own is enough
        return ("%s.csfasta" % library.name, "%s.qual" % library.name)
    if F5:
        return ("%s_F5.csfasta" % library.name, "%s_F5.qual" % library.name)
    return ("%s_F3.csfasta" % library.name, "%s_F3.qual" % library.name)
def __partial_names(self, library, F5):
    """Internal: link names based on 'partial' naming scheme

    Names are built from '<instrument>_<datestamp>_<library name>'.
    For paired-end runs an '_F3' or '_F5' tag is added so the two
    read sets can be told apart.

    Arguments:
      library: library object; its parent_sample.parent_run supplies
        the run information
      F5: if true (and the run is paired-end), return the F5 names

    Returns:
      Tuple (csfasta_name, qual_name).
    """
    run = library.parent_sample.parent_run
    run_info = run.run_info
    name = '_'.join((run_info.instrument, run_info.datestamp, library.name))
    if not SolidData.is_paired_end(run):
        # Fragment run: no read tag needed
        return ("%s.csfasta" % name, "%s_QV.qual" % name)
    if F5:
        return ("%s_F5.csfasta" % name, "%s_F5_QV.qual" % name)
    return ("%s_F3.csfasta" % name, "%s_F3_QV.qual" % name)
def print_md5sums(solid_runs):
    """Calculate and print md5sums for primary data files

    This will generate a list of md5sums that can be passed to the
    md5sum program to check against a copy of the runs using

    md5sum -c CHECKSUMS

    One line is printed per primary data file (F3 csfasta and qual,
    plus the F5 csfasta and qual for paired-end runs). Each line is
    the checksum, two spaces, and the file name after it has been
    passed through strip_prefix() together with the current working
    directory.

    A failure on one file is logged via logging.error and does not
    stop the remaining files from being processed.

    Arguments:
      solid_runs: list or tuple of SolidRun instances.
    """
    # (library attribute holding the file name, message logged on failure)
    f3_files = (("csfasta", "FAILED for F3 csfasta: %s"),
                ("qual", "FAILED for F3 qual: %s"))
    f5_files = (("csfasta_f5", "FAILED for F5 csfasta: %s"),
                ("qual_f5", "FAILED for F5 qual: %s"))
    for run in solid_runs:
        # Paired-end status is a property of the run, so decide once
        # per run which files need to be checksummed
        if SolidData.is_paired_end(run):
            data_files = f3_files + f5_files
        else:
            data_files = f3_files
        for sample in run.samples:
            for library in sample.libraries:
                for attr, failure_message in data_files:
                    try:
                        data_file = getattr(library, attr)
                        # Two characters must separate checksum and
                        # name for 'md5sum -c' to accept the line
                        print("%s  %s" % (Md5sum.md5sum(data_file),
                                          strip_prefix(data_file,
                                                       os.getcwd())))
                    except Exception as ex:
                        # Best effort: report the problem and carry on
                        logging.error(failure_message % ex)
def buildAnalysisDirs(self, top_dir=None, dry_run=False, link_type="relative",
                      naming_scheme="partial"):
    """Construct and populate analysis directories for the experiments

    For each experiment in self.experiments, make the analysis
    directory given by the experiment's dirname() and fill it with
    links to the matching primary data files found in each run of
    self.solid_runs.

    Arguments:
      top_dir: if set then create the analysis directories as
        subdirs of the specified directory; otherwise operate in cwd
      dry_run: if True then only report the mkdir, ln etc operations
        that would be performed. Default is False (do perform the
        operations).
      link_type: type of link to use when linking to primary data,
        one of 'relative' or 'absolute'.
      naming_scheme: naming scheme to use for links to primary data,
        one of 'full' (same names as primary data files), 'partial'
        (cut-down version of the full name which excludes sample
        names - the default), or 'minimal' (just the library name).
    """
    # Top-level directory: create it, or just report the mkdir
    if top_dir:
        if os.path.exists(top_dir):
            print("Directory %s already exists" % top_dir)
        elif dry_run:
            # Report what would have been done
            print("mkdir %s" % top_dir)
        else:
            print("Creating %s" % top_dir)
            utils.mkdir(top_dir, mode=0o775)
    # Any link_type other than 'absolute' gives relative links
    use_relative_links = (link_type != "absolute")
    # Description of each read set: (csfasta attribute, qual attribute,
    # extra keyword arguments for LinkNames.names(), message logged
    # when linking fails)
    f3_reads = ("csfasta", "qual", {},
                "Failed to link to some or all F3 primary data")
    f5_reads = ("csfasta_f5", "qual_f5", {"F5": True},
                "Failed to link to some or all F5 primary data")
    for expt in self.experiments:
        print("Experiment: %s %s %s/%s" % (expt.name, expt.type,
                                           expt.sample, expt.library))
        expt_dir = expt.dirname(top_dir)
        print("\tDir: %s" % expt_dir)
        # Experiment directory: create it, or just report the mkdir
        if os.path.exists(expt_dir):
            logging.warning("Directory %s already exists" % expt_dir)
        elif dry_run:
            print("mkdir %s" % expt_dir)
        else:
            utils.mkdir(expt_dir, mode=0o775)
        # Link in the primary data from every run with matching libraries
        for run in self.solid_runs:
            paired_end = SolidData.is_paired_end(run)
            # F5 reads are only linked for paired-end runs
            if paired_end:
                read_sets = (f3_reads, f5_reads)
            else:
                read_sets = (f3_reads,)
            for library in run.fetchLibraries(expt.sample, expt.library):
                for csfasta_attr, qual_attr, name_opts, failure in read_sets:
                    ln_csfasta, ln_qual = \
                        LinkNames(naming_scheme).names(library, **name_opts)
                    print("\t\t%s" % ln_csfasta)
                    print("\t\t%s" % ln_qual)
                    # A failure on either file abandons this read set
                    # only; the error is logged and processing goes on
                    try:
                        for attr, link_name in ((csfasta_attr, ln_csfasta),
                                                (qual_attr, ln_qual)):
                            self.__linkToFile(
                                getattr(library, attr),
                                os.path.join(expt_dir, link_name),
                                relative=use_relative_links,
                                dry_run=dry_run)
                    except Exception as ex:
                        logging.error(failure)
                        logging.error("Exception: %s" % ex)
def buildAnalysisDirs(self, top_dir=None, dry_run=False, link_type="relative",
                      naming_scheme="partial"):
    """Construct and populate analysis directories for the experiments

    For each experiment in self.experiments, make the analysis
    directory given by the experiment's dirname() and fill it with
    links to the matching primary data files found in each run of
    self.solid_runs.

    Arguments:
      top_dir: if set then create the analysis directories as
        subdirs of the specified directory; otherwise operate in cwd
      dry_run: if True then only report the mkdir, ln etc operations
        that would be performed. Default is False (do perform the
        operations).
      link_type: type of link to use when linking to primary data,
        one of 'relative' or 'absolute'.
      naming_scheme: naming scheme to use for links to primary data,
        one of 'full' (same names as primary data files), 'partial'
        (cut-down version of the full name which excludes sample
        names - the default), or 'minimal' (just the library name).
    """
    # Top-level directory: create it, or just report the mkdir
    if top_dir:
        if os.path.exists(top_dir):
            print("Directory %s already exists" % top_dir)
        elif dry_run:
            # Report what would have been done
            print("mkdir %s" % top_dir)
        else:
            print("Creating %s" % top_dir)
            bcf_utils.mkdir(top_dir, mode=0o775)
    # Any link_type other than 'absolute' gives relative links
    use_relative_links = (link_type != 'absolute')
    # Description of each read set: (csfasta attribute, qual attribute,
    # extra keyword arguments for LinkNames.names(), message logged
    # when linking fails)
    f3_reads = ("csfasta", "qual", {},
                "Failed to link to some or all F3 primary data")
    f5_reads = ("csfasta_f5", "qual_f5", {"F5": True},
                "Failed to link to some or all F5 primary data")
    for expt in self.experiments:
        print("Experiment: %s %s %s/%s" % (expt.name, expt.type,
                                           expt.sample, expt.library))
        expt_dir = expt.dirname(top_dir)
        print("\tDir: %s" % expt_dir)
        # Experiment directory: create it, or just report the mkdir
        if os.path.exists(expt_dir):
            logging.warning("Directory %s already exists" % expt_dir)
        elif dry_run:
            print("mkdir %s" % expt_dir)
        else:
            bcf_utils.mkdir(expt_dir, mode=0o775)
        # Link in the primary data from every run with matching libraries
        for run in self.solid_runs:
            paired_end = SolidData.is_paired_end(run)
            # F5 reads are only linked for paired-end runs
            if paired_end:
                read_sets = (f3_reads, f5_reads)
            else:
                read_sets = (f3_reads,)
            for library in run.fetchLibraries(expt.sample, expt.library):
                for csfasta_attr, qual_attr, name_opts, failure in read_sets:
                    ln_csfasta, ln_qual = \
                        LinkNames(naming_scheme).names(library, **name_opts)
                    print("\t\t%s" % ln_csfasta)
                    print("\t\t%s" % ln_qual)
                    # A failure on either file abandons this read set
                    # only; the error is logged and processing goes on
                    try:
                        for attr, link_name in ((csfasta_attr, ln_csfasta),
                                                (qual_attr, ln_qual)):
                            self.__linkToFile(
                                getattr(library, attr),
                                os.path.join(expt_dir, link_name),
                                relative=use_relative_links,
                                dry_run=dry_run)
                    except Exception as ex:
                        logging.error(failure)
                        logging.error("Exception: %s" % ex)
def _verify_data_file(data_file, description, sample_name, library_name):
    """Check one primary data file and report any problem found.

    Arguments:
      data_file: file name assigned to the library (may be unset)
      description: label used in the messages, e.g. 'F3 csfasta'
      sample_name: sample name used to identify the library
      library_name: library name used in the messages

    Returns:
      True if a file name is set and the file exists, False if not
      (in which case a 'No ...' or 'Missing ...' line is printed).
    """
    if not data_file:
        # No file was assigned to the library at all
        print("No %s for %s/%s" % (description, sample_name, library_name))
        return False
    if not os.path.exists(data_file):
        # A file was assigned but isn't on disk
        print("Missing %s for %s/%s" % (description, sample_name,
                                        library_name))
        return False
    return True


def verify_runs(solid_dirs):
    """Do basic verification checks on SOLiD run directories

    For each SOLiD run directory, create a SolidRun object and check
    for the expected sample and library directories, and that primary
    data files (csfasta and qual, plus the F5 equivalents for
    paired-end runs) have been assigned and exist.

    A '[PASSED]' or '[FAILED]' line is printed for each run, followed
    by an overall status line.

    Arguments:
      solid_dirs: a list of SOLiD sequencing directory names.

    Returns:
      UNIX-like status code: 0 if every run is verified, 1 if there
      is a problem with any of them.
    """
    print("Performing verification")
    status = 0
    for solid_dir in solid_dirs:
        run_ok = True
        run = SolidData.SolidRun(solid_dir)
        if not run:
            # Some error processing the basics
            run_ok = False
        else:
            # Should have non-zero numbers of samples and libraries
            if not run.samples:
                print("No sample data")
                run_ok = False
            paired_end = SolidData.is_paired_end(run)
            for sample in run.samples:
                if not sample.libraries:
                    print("No libraries for sample %s" % sample.name)
                    run_ok = False
                for library in sample.libraries:
                    data_files = [("F3 csfasta", library.csfasta),
                                  ("F3 qual", library.qual)]
                    if paired_end:
                        # Paired-end run: F5 reads must be there too
                        data_files.append(("F5 csfasta",
                                           library.csfasta_f5))
                        data_files.append(("F5 qual", library.qual_f5))
                    # Check every file (no short-circuit) so that all
                    # problems get reported, not just the first
                    for description, data_file in data_files:
                        if not _verify_data_file(data_file, description,
                                                 sample.name, library.name):
                            run_ok = False
        # Completed checks for run
        if run_ok:
            verdict = " [PASSED]"
        else:
            verdict = " [FAILED]"
            status = 1
        print("%s: %s" % (run.run_name, verdict))
    # Completed
    if status == 0:
        verdict = " [PASSED]"
    else:
        verdict = " [FAILED]"
    print("\nOverall status: %s" % verdict)
    return status