Example #1
 def setup(self):
     # Check the QC protocol
     qc_info = self.args.project.qc_info(self.args.qc_dir)
     stored_protocol = qc_info.protocol
     if stored_protocol is not None and \
        stored_protocol != self.args.qc_protocol:
         logger.warning("QC protocol mismatch for %s: "
                        "'%s' stored, '%s' specified"
                        % (self.args.project.name,
                           stored_protocol,
                           self.args.qc_protocol))
         logger.warning("Stored protocol will be ignored")
     # Set up QC dir
     if not os.path.exists(self.args.qc_dir):
         mkdir(self.args.qc_dir)
     # Set up log dir
     if self.args.log_dir is None:
         log_dir = os.path.join(self.args.qc_dir,'logs')
     else:
         log_dir = self.args.log_dir
     if not os.path.exists(log_dir):
         mkdir(log_dir)
     # Store the QC protocol data
     qc_info['protocol'] = self.args.qc_protocol
     qc_info['fastq_dir'] = self.args.project.fastq_dir
     qc_info.save()
Example #2
def mkdir(newdir):
    """
    Create a directory

    The new directory should be identified using a
    specifier of the form '[[USER@]HOST:]NEWDIR'.

    Arguments:
      newdir (str): location of the new directory (can
        be on local or remote system)
    """
    newdir = Location(newdir)
    if not newdir.is_remote:
        # Local directory
        bcftbx_utils.mkdir(newdir.path)
    else:
        # Remote directory
        try:
            mkdir_cmd = applications.general.ssh_command(
                newdir.user, newdir.server, ('mkdir', newdir.path))
            print "Running %s" % mkdir_cmd
            mkdir_cmd.run_subprocess()
        except Exception as ex:
            raise Exception("Exception making remote directory %s: %s" %
                            (newdir, ex))
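The '[[USER@]HOST:]NEWDIR' specifier that Location handles can be parsed with a few lines of standard library code. A minimal sketch, assuming Unix-style paths ('parse_location' is an illustrative stand-in, not the package's Location class):

import re

def parse_location(specifier):
    """Split a '[[USER@]HOST:]PATH' specifier into (user, host, path)"""
    m = re.match(r'^(?:(?:([^@:]+)@)?([^@:]+):)?(.+)$', specifier)
    return m.groups()

# parse_location('/data/newdir')           -> (None, None, '/data/newdir')
# parse_location('server:/data/newdir')    -> (None, 'server', '/data/newdir')
# parse_location('me@server:/data/newdir') -> ('me', 'server', '/data/newdir')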
Example #3
 def create_directory(self, dirn):
     # Make the specified directory, and any leading directories
     # that don't already exist
     if not os.path.exists(dirn):
         dir_path = os.sep
         for sub_dir in dirn.split(os.sep):
             dir_path = os.path.join(dir_path, sub_dir)
             if not os.path.exists(dir_path):
                 print("Making %s" % dir_path)
                 bcf_utils.mkdir(dir_path)
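create_directory builds each missing leading component by hand so that every created level can be reported. Where per-level reporting isn't needed, os.makedirs with exist_ok=True (Python 3.2+) collapses the whole loop; a minimal equivalent for comparison:

import os

def create_directory(dirn):
    # Make the directory and any missing leading directories;
    # exist_ok suppresses the error if it already exists
    os.makedirs(dirn, exist_ok=True)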
Example #5
def build_library_directory(analysis_dir, dest, projects=None):
    """
    Build and populate data library directory on server

    Arguments:
      analysis_dir (AnalysisDir): analysis directory to export
        files from
      dest (str): location of top-level data library directory
      projects (list): list of projects to export (default is to
        export all projects)
    
    """
    # Create and populate internal directory structure on server
    user, server, dirn = split_user_host_dir(dest)
    remote = (server is not None)
    if remote:
        logging.critical("Dealing with remote systems not implemented")
        raise NotImplementedError(
            "Cannot build library directory on remote system")
    run_path = os.path.join(dirn, analysis_dir.run_name)
    print "Creating %s" % run_path
    mkdir(run_path)
    for project in analysis_dir.get_projects(include_undetermined=False):
        if projects is not None and project.name not in projects:
            print "Ignoring project '%s'" % project.name
            continue
        project_path = os.path.join(run_path, project.name)
        print "Creating %s" % project_path
        mkdir(project_path)
        print "Populating with uncompressed Fastqs:"
        for sample in project.samples:
            for fq in sample.fastq:
                fqcp = os.path.join(project_path, os.path.basename(fq))
                if fqcp.endswith('.gz'):
                    fqcp = fqcp[0:-3]
                if os.path.exists(fqcp):
                    print "-- found: %s" % fqcp
                    continue
                print "-- %s" % fqcp
                with get_fastq_file_handle(fq) as fp:
                    with open(fqcp, 'wb') as fpcp:
                        while True:
                            data = fp.read(102400)
                            if not data:
                                break
                            fpcp.write(data)
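The copy loop above relies on get_fastq_file_handle to return a file object that transparently decompresses gzipped Fastqs. A self-contained stand-in using the stdlib gzip module, with the same 100KB chunked copy expressed via shutil.copyfileobj:

import gzip
import shutil

def get_fastq_file_handle(fq, mode='rb'):
    """Open a Fastq file, decompressing on the fly if it ends in '.gz'"""
    if fq.endswith('.gz'):
        return gzip.open(fq, mode)
    return open(fq, mode)

def uncompress_copy(fq, dest, bufsize=102400):
    """Copy Fastq 'fq' to 'dest', decompressing it in 100KB chunks"""
    with get_fastq_file_handle(fq) as fp, open(dest, 'wb') as fpcp:
        shutil.copyfileobj(fp, fpcp, bufsize)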
Example #6
    # Check for underlying programs
    required = ["fastq_screen"]
    if args.aligner is not None:
        required.append(args.aligner)
    else:
        logging.warning("Aligner not specified, cannot check")
    for prog in required:
        if find_program(prog) is None:
            logging.critical("couldn't find '%s'" % prog)
            sys.exit(1)

    # Make output dir
    if args.out_dir is not None:
        out_dir = os.path.abspath(args.out_dir)
        mkdir(out_dir)
    else:
        out_dir = os.getcwd()

    # Screen against 'mammalian' genomes
    tagged_fastq = fastq_screen_tag(mammalian_conf,
                                    fqr2,
                                    aligner=args.aligner,
                                    threads=args.threads,
                                    out_dir=out_dir,
                                    tempdir=out_dir)
    mammalian_tagged_fq = strip_ext(tagged_fastq,'.fastq') + '.' + \
                          os.path.basename(
                              strip_ext(mammalian_conf,'.conf')) + \
                          '.fastq'
    os.rename(tagged_fastq, mammalian_tagged_fq)
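The renaming above inserts the screen's .conf basename into the tagged Fastq name. strip_ext is assumed here to simply drop a named extension; with that stand-in the transformation is easy to trace:

def strip_ext(path, ext):
    """Remove extension 'ext' from 'path', if present (illustrative stand-in)"""
    return path[:-len(ext)] if path.endswith(ext) else path

# e.g. tagged output 'sample.tagged.fastq' screened against
# 'fastq_screen_mammalian.conf' is renamed to
# 'sample.tagged.fastq_screen_mammalian.fastq'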
Example #7
def create_analysis_dir(project,
                        top_dir=None,
                        merge_replicates=False,
                        keep_names=False,
                        dry_run=False):
    """Create and populate analysis directory for an IlluminaProject

    Creates a new directory and populates either with links to FASTQ
    files, or with 'merged' FASTQ files created by concatenating
    multiple FASTQs for each sample (which can happen for multiplexed
    runs where samples are split across multiple lanes).

    Project directory names are made up of the project name and then
    the experiment type, or just the project name if experiment type
    is not set.

    Arguments:
      project   : populated IlluminaProject object
      top_dir   : parent directory to create analysis subdirectory
                  under. Defaults to cwd if not explicitly specified
      merge_replicates: if True then creates a single FASTQ file for
                  each sample by merging multiple FASTQs together
      keep_names: if True then links to FASTQ files will have the same
                  names as the original files; by default links use the
                  shortest unique name
      dry_run   : if True then report what would be done but don't
                  actually perform any action

    Returns:
      Name of the project directory.
    
    """
    if top_dir is None:
        top_dir = os.getcwd()
    project_dir = os.path.join(top_dir, project.full_name)
    print("Creating analysis directory for project '%s'..." %
          project.full_name)
    # Check for & create directory
    if os.path.exists(project_dir):
        print "-> %s already exists" % project_dir
    else:
        print "Making analysis directory for %s" % project.name
        if not dry_run:
            bcf_utils.mkdir(project_dir,mode=0o775)
    # Make an empty ScriptCode directory
    scriptcode_dir = os.path.join(project_dir,"ScriptCode")
    if os.path.exists(scriptcode_dir):
        print "'ScriptCode' directory %s already exists" % scriptcode_dir
    else:
        print "Making 'ScriptCode' directory for %s" % project.name
        if not dry_run:
            bcf_utils.mkdir(scriptcode_dir,mode=0o775)
    # Check for & create links to fastq files
    if not merge_replicates:
        for sample in project.samples:
            fastq_names = IlluminaData.get_unique_fastq_names(sample.fastq)
            for fastq in sample.fastq:
                fastq_file = os.path.join(sample.dirn,fastq)
                if keep_names:
                    fastq_ln = os.path.join(project_dir,fastq)
                else:
                    fastq_ln = os.path.join(project_dir,fastq_names[fastq])
                if os.path.exists(fastq_ln):
                    logging.error("Failed to link to %s: %s already exists" %
                                  (fastq_file,os.path.basename(fastq_ln)))
                else:
                    print "Linking to %s" % fastq
                    if not dry_run:
                        bcf_utils.mklink(fastq_file,fastq_ln,relative=True)
    else:
        # Merge files for replicates within each sample
        for sample in project.samples:
            replicates = {}
            # Gather replicates to be merged
            for fastq in sample.fastq:
                fastq_data = IlluminaData.IlluminaFastq(fastq)
                name = "%s_%s_R%d" % (fastq_data.sample_name,
                                      fastq_data.barcode_sequence,
                                      fastq_data.read_number)
                if name not in replicates:
                    replicates[name] = []
                replicates[name].append(os.path.join(sample.dirn,fastq))
                # Sort into order
                replicates[name].sort()
            # Report detected replicates
            print "Sample %s" % sample.name
            for name in replicates:
                print "\tReplicate '%s'" % name
                for fastq in replicates[name]:
                    print "\t\t%s" % fastq
            # Do the merge
            for name in replicates:
                merged_fastq = os.path.join(project_dir,name+'.fastq')
                bcf_utils.concatenate_fastq_files(merged_fastq,replicates[name])
    # Return directory name
    return project_dir
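A hypothetical invocation of create_analysis_dir, assuming a populated IlluminaProject from the IlluminaData module used above (the constructor arguments shown are illustrative, not the module's confirmed signature); dry_run=True reports the planned directories and links without touching the filesystem:

# Hypothetical usage - constructor arguments are illustrative
project = IlluminaData.IlluminaProject('/runs/XYZ/Unaligned/Project_AB')
project_dir = create_analysis_dir(project,
                                  top_dir='/analysis',
                                  merge_replicates=True,
                                  dry_run=True)
print("Would create: %s" % project_dir)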
Example #8
def clone(ap, clone_dir, copy_fastqs=False, exclude_projects=False):
    """
    Make a 'clone' (i.e. copy) of an analysis directory

    Makes a functional copy of an existing analysis directory,
    including metadata and parameters, stats files, processing
    reports and project subdirectories.

    By default the 'unaligned' directory in the new directory is
    simply a symlink to the original directory; set 'copy_fastqs'
    to True to make copies instead.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the parent
        analysis directory
      clone_dir (str): path to the new directory to create as a
        clone (must not already exist).
      copy_fastqs (boolean): set to True to copy the Fastq files
        (otherwise default behaviour is to make symlinks)
      exclude_projects (boolean): set to True to exclude any
        projects from the parent analysis directory
    """
    clone_dir = os.path.abspath(clone_dir)
    print "Cloning into %s" % clone_dir
    if os.path.exists(clone_dir):
        # Directory already exists
        logger.critical("Target directory '%s' already exists" % clone_dir)
        raise Exception("Clone failed: target directory '%s' "
                        "already exists" % clone_dir)
    bcf_utils.mkdir(clone_dir)
    # Copy metadata and parameters
    for f in (ap.metadata_file, ap.parameter_file):
        if os.path.exists(f):
            shutil.copy(f, os.path.join(clone_dir, os.path.basename(f)))
    # Primary data directory
    if ap.params.primary_data_dir:
        primary_data_dir = os.path.join(ap.analysis_dir,
                                        ap.params.primary_data_dir)
        if os.path.isdir(primary_data_dir):
            clone_primary_data_dir = os.path.join(
                clone_dir, os.path.basename(primary_data_dir))
            print "[Primary data] making %s" % clone_primary_data_dir
            bcf_utils.mkdir(clone_primary_data_dir)
            data_dir = os.path.basename(ap.params.data_dir)
            if os.path.exists(os.path.join(primary_data_dir, data_dir)):
                clone_data_dir = os.path.join(clone_primary_data_dir, data_dir)
                print "[Primary data] symlinking %s" % clone_data_dir
                os.symlink(os.path.join(primary_data_dir, data_dir),
                           clone_data_dir)
    # Link to or copy fastqs
    if not ap.params.unaligned_dir:
        for d in (
                'Unaligned',
                'bcl2fastq',
        ):
            unaligned_dir = os.path.join(ap.analysis_dir, d)
            if os.path.isdir(unaligned_dir):
                break
            unaligned_dir = None
    else:
        unaligned_dir = os.path.join(ap.analysis_dir, ap.params.unaligned_dir)
    if unaligned_dir is not None and os.path.isdir(unaligned_dir):
        clone_unaligned_dir = os.path.join(clone_dir,
                                           os.path.basename(unaligned_dir))
        if not copy_fastqs:
            # Link to unaligned dir
            print "[Unaligned] symlinking %s" % clone_unaligned_dir
            os.symlink(unaligned_dir, clone_unaligned_dir)
        else:
            # Copy unaligned dir
            print "[Unaligned] copying %s" % clone_unaligned_dir
            shutil.copytree(unaligned_dir, clone_unaligned_dir)
    else:
        print "[Unaligned] no 'unaligned' dir found"
    # Duplicate project directories
    projects = ap.get_analysis_projects()
    if projects and not exclude_projects:
        for project in projects:
            print("[Projects] duplicating project '%s'" % project.name)
            fastqs = project.fastqs
            new_project = AnalysisProject(
                project.name,
                os.path.join(clone_dir, project.name),
                user=project.info.user,
                PI=project.info.PI,
                library_type=project.info.library_type,
                single_cell_platform=project.info.single_cell_platform,
                organism=project.info.organism,
                run=project.info.run,
                comments=project.info.comments,
                platform=project.info.platform)
            new_project.create_directory(fastqs=fastqs,
                                         link_to_fastqs=(not copy_fastqs))
    # Copy additional files, if found
    for f in (
            "SampleSheet.orig.csv",
            ("custom_SampleSheet.csv"
             if not ap.params.sample_sheet else ap.params.sample_sheet),
            ("projects.info"
             if not ap.params.project_metadata
             else ap.params.project_metadata),
            ("statistics.info"
             if not ap.params.stats_file else ap.params.stats_file),
            ("per_lane_statistics.info"
             if not ap.params.per_lane_stats_file
             else ap.params.per_lane_stats_file),
            "statistics_full.info",
            "per_lane_sample_stats.info",
            "processing_qc.html",
    ):
        if not f:
            continue
        srcpath = os.path.join(ap.analysis_dir, f)
        if os.path.exists(srcpath):
            print "[Files] copying %s" % f
            shutil.copy(srcpath, clone_dir)
    # Create the basic set of subdirectories
    for subdir in (
            'logs',
            'ScriptCode',
    ):
        print "[Subdirectories] making %s" % subdir
        bcf_utils.mkdir(os.path.join(clone_dir, subdir))
    # Update the settings
    parameter_file = os.path.join(clone_dir,
                                  os.path.basename(ap.parameter_file))
    params = AnalysisDirParameters(filen=parameter_file)
    for p in ("sample_sheet", "primary_data_dir"):
        if not params[p]:
            continue
        print "[Parameters] updating '%s'" % p
        params[p] = os.path.join(clone_dir,
                                 os.path.relpath(params[p], ap.analysis_dir))
    params.save()
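A hypothetical call to clone, assuming an AutoProcessor instance pointed at an existing analysis directory (the constructor call is illustrative):

# Hypothetical usage - symlink the Fastqs and keep all projects
ap = AutoProcessor('/data/180101_M00123_0001_XYZ_analysis')
clone(ap,
      '/scratch/180101_M00123_0001_XYZ_analysis.clone',
      copy_fastqs=False,
      exclude_projects=False)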
Example #9
        sys.exit(1)

    # Make top-level output dirs
    icell8_dir = os.path.abspath(outdir)
    if os.path.exists(icell8_dir) and args.project is None:
        if not args.force:
            logger.fatal("Output destination '%s': already exists "
                         "(remove or use --force to overwrite)" % icell8_dir)
            sys.exit(1)
        logger.warning("Removing existing output destination '%s'" %
                       icell8_dir)
        shutil.rmtree(icell8_dir)
    log_dir = os.path.join(icell8_dir, "logs")
    scripts_dir = os.path.join(icell8_dir, "scripts")
    for dirn in (icell8_dir, log_dir, scripts_dir):
        mkdir(dirn)

    # Copy well list file into output directory
    shutil.copy(well_list, outdir)
    well_list = os.path.join(outdir, os.path.basename(well_list))
    if analysis_project is not None:
        analysis_project.info['icell8_well_list'] = os.path.basename(well_list)
        analysis_project.info.save()

    # Set up pipelines
    pipelines = []

    # ICELL QC and filtering
    print "Setting up a pipeline for ICELL processing"
    pipelines.append(
        ICell8QCFilter(outdir,
Example #10
def main():
    # Handle the command line
    p = argparse.ArgumentParser()
    p.add_argument("fastqs",
                   nargs='*',
                   metavar="FASTQ_R1 FASTQ_R2",
                   help="FASTQ file pairs")
    p.add_argument("-w",
                   "--well-list",
                   dest="well_list_file",
                   default=None,
                   help="iCell8 'well list' file")
    p.add_argument("-m",
                   "--mode",
                   dest="splitting_mode",
                   default="barcodes",
                   choices=["barcodes", "batch", "none"],
                   help="how to split the input FASTQs: 'barcodes' "
                   "(one FASTQ pair per barcode), 'batch' (one or "
                   "more FASTQ pairs with fixed number of reads not "
                   "exceeding BATCH_SIZE), or 'none' (output all "
                   "reads to a single FASTQ pair) (default: "
                   "'barcodes')")
    p.add_argument("-s",
                   "--size",
                   type=int,
                   dest="batch_size",
                   default=DEFAULT_BATCH_SIZE,
                   help="number of reads per batch in 'batch' mode "
                   "(default: %d)" % DEFAULT_BATCH_SIZE)
    p.add_argument("-b",
                   "--basename",
                   default="icell8",
                   help="basename for output FASTQ files (default: "
                   "'icell8')")
    p.add_argument("-o",
                   "--outdir",
                   dest="out_dir",
                   default=None,
                   help="directory to write output FASTQ files to "
                   "(default: current directory)")
    p.add_argument("-d",
                   "--discard-unknown-barcodes",
                   dest='discard_unknown_barcodes',
                   action='store_true',
                   help="discard reads with barcodes which don't "
                   "match any of those in the WELL_LIST_FILE "
                   "(default: keep all reads)")
    p.add_argument("-q",
                   "--quality-filter",
                   dest='quality_filter',
                   action='store_true',
                   help="filter reads by barcode and UMI quality "
                   "(default: don't filter reads on quality)")
    p.add_argument("-c",
                   "--compress",
                   action='store_true',
                   help="output compressed .gz FASTQ files")
    args = p.parse_args()

    # Convert quality cutoffs to character encoding
    barcode_quality_cutoff = chr(INLINE_BARCODE_QUALITY_CUTOFF + 33)
    umi_quality_cutoff = chr(UMI_QUALITY_CUTOFF + 33)

    # Get well list and expected barcodes
    well_list_file = args.well_list_file
    if well_list_file is not None:
        well_list_file = os.path.abspath(args.well_list_file)
    well_list = ICell8WellList(well_list_file)
    expected_barcodes = set(well_list.barcodes())
    print "%d expected barcodes" % len(expected_barcodes)

    # Filtering on barcode
    do_check_barcodes = args.discard_unknown_barcodes
    if do_check_barcodes and well_list_file is None:
        logging.fatal("-d/--discard-unknown-barcodes: need to supply a "
                      "well list file")
        sys.exit(1)

    # Filter on barcode and UMI quality
    do_quality_filter = args.quality_filter

    # Splitting mode
    splitting_mode = args.splitting_mode
    batch_size = args.batch_size

    # Count barcodes and rejections
    assigned = 0
    unassigned = 0
    filtered = 0
    barcode_list = set()
    filtered_counts = {}

    # Input Fastqs
    fastqs = pair_fastqs([fq for fq in args.fastqs])[0]

    # Output Fastqs
    output_fqs = BufferedOutputFiles(base_dir=args.out_dir)
    if args.out_dir is not None:
        out_dir = os.path.abspath(args.out_dir)
        mkdir(out_dir)
    else:
        out_dir = os.getcwd()
    basename = args.basename

    # Compress outputs?
    if args.compress:
        fastq_ext = "fastq.gz"
    else:
        fastq_ext = "fastq"

    # Iterate over pairs of Fastqs
    for fastq_pair in fastqs:
        # Iterate over read pairs from the Fastqs
        print "-- %s\n   %s" % fastq_pair
        print "   Starting at %s" % time.ctime()
        start_time = time.time()
        for i, read_pair in enumerate(ICell8FastqIterator(*fastq_pair),
                                      start=1):
            # Deal with read pair
            if (i % 100000) == 0:
                print "   Examining read pair #%d (%s)" % \
                    (i,time.ctime())
            inline_barcode = read_pair.barcode
            barcode_list.add(inline_barcode)
            # Initial assignment
            assign_to = inline_barcode
            # Apply quality filtering
            if do_quality_filter:
                if not pass_quality_filter(read_pair.barcode_quality,
                                           barcode_quality_cutoff):
                    assign_to = "failed_barcode"
                elif not pass_quality_filter(read_pair.umi_quality,
                                             umi_quality_cutoff):
                    assign_to = "failed_umi"
                else:
                    filtered += 1
            # Check barcode is valid
            if do_check_barcodes:
                if inline_barcode not in expected_barcodes:
                    assign_to = "unassigned"
                    unassigned += 1
                else:
                    assigned += 1
            logging.debug("%s" % '\t'.join([
                assign_to, inline_barcode, read_pair.umi,
                read_pair.min_barcode_quality, read_pair.min_umi_quality
            ]))
            # Post filtering counts
            if assign_to == inline_barcode:
                try:
                    filtered_counts[inline_barcode] += 1
                except KeyError:
                    filtered_counts[inline_barcode] = 1
                # Reassign read pair to appropriate output files
                if splitting_mode == "batch":
                    # Output to a batch-specific file pair
                    batch_number = filtered // batch_size
                    assign_to = "B%03d" % batch_number
                elif splitting_mode == "none":
                    # Output to a single file pair
                    assign_to = "filtered"
            # Write read pair
            fq_r1 = "%s_R1" % assign_to
            fq_r2 = "%s_R2" % assign_to
            if fq_r1 not in output_fqs:
                try:
                    # Try to reopen file and append
                    output_fqs.open(fq_r1, append=True)
                except KeyError:
                    # Open new file
                    output_fqs.open(
                        fq_r1,
                        "%s.%s.r1.%s" % (basename, assign_to, fastq_ext))
            output_fqs.write(fq_r1, "%s" % read_pair.r1)
            if fq_r2 not in output_fqs:
                try:
                    # Try to reopen file and append
                    output_fqs.open(fq_r2, append=True)
                except KeyError:
                    # Open new file
                    output_fqs.open(
                        fq_r2,
                        "%s.%s.r2.%s" % (basename, assign_to, fastq_ext))
            output_fqs.write(fq_r2, "%s" % read_pair.r2)
        print "   Finished at %s" % time.ctime()
        print "   (Took %.0fs)" % (time.time() - start_time)
    # Close output files
    output_fqs.close()

    # Summary output to screen
    total_reads = assigned + unassigned
    print "Summary:"
    print "--------"
    print "Number of barcodes         : %d" % len(barcode_list)
    if do_check_barcodes:
        print "Number of expected barcodes: %d/%d" % \
            (len(filtered_counts.keys()),
             len(expected_barcodes))
    print "Total reads                : %d" % total_reads
    if do_quality_filter:
        print "Total reads (filtered)     : %d" % filtered
    if do_check_barcodes:
        print "Total reads (assigned)     : %d" % assigned
        print "Unassigned reads           : %d" % unassigned
Example #11
    if args.sample_pattern is not None:
        samples = project.get_samples(args.sample_pattern)
    else:
        samples = project.samples
    if not samples:
        logger.warning("No samples specified for QC, quitting")
        sys.exit()
    print "%d samples matched" % len(samples)
    for sample in samples:
        print "-- %s" % sample.name

    # Set up QC dir
    qc_dir = project.setup_qc_dir(qc_dir=args.qc_dir)
    print "QC output dir: %s" % qc_dir
    log_dir = os.path.join(qc_dir,'logs')
    mkdir(log_dir)
    qc_base = os.path.basename(qc_dir)

    # Output file name
    if args.filename is None:
        out_file = '%s_report.html' % qc_base
    else:
        out_file = args.filename
    if not os.path.isabs(out_file):
        out_file = os.path.join(project.dirn,out_file)
    print "QC report: %s" % out_file

    # Run the QC
    announce("Running QC")
    max_jobs = __settings.general.max_concurrent_jobs
    sched = SimpleScheduler(runner=qc_runner,
Example #12
def merge_fastq_dirs(ap,
                     primary_unaligned_dir,
                     output_dir=None,
                     dry_run=False):
    """
    Combine multiple 'unaligned' output directories into one
    
    This method combines the output from multiple runs of
    CASAVA/bcl2fastq into a single 'unaligned'-equivalent
    directory.

    Currently it operates in an automatic mode and should
    detect additional 'unaligned' dirs on its own.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the parent
        analysis directory
      primary_unaligned_dir (str): the 'unaligned' dir that
        data from all the others will be put into (relative
        path), unless overridden by the 'output_dir' argument
      output_dir (str): optional, new 'unaligned' dir that
        will be created to hold merged data (relative path,
        defaults to 'primary_unaligned_dir')
      dry_run (boolean): if True then just report operations
        that would have been performed.
    """
    if primary_unaligned_dir is None:
        raise Exception("Primary unaligned dir not defined")
    # Output directory
    if output_dir is None:
        output_dir = primary_unaligned_dir
    print("Fastqs will be merged into '%s'" % output_dir)
    # Collect unaligned dirs
    print("Collecting bcl2fastq directories")
    primary_illumina_data = None
    unaligned_dirs = {}
    for dirn in list_dirs(ap.analysis_dir):
        try:
            illumina_data = IlluminaData.IlluminaData(ap.analysis_dir,
                                                      unaligned_dir=dirn)
            if dirn == primary_unaligned_dir:
                print("* %s (primary dir)" % dirn)
                primary_illumina_data = illumina_data
            elif dirn.endswith(".bak") or dirn.startswith("save."):
                print("Ignoring %s" % dirn)
            else:
                print("* %s" % dirn)
                unaligned_dirs[dirn] = illumina_data
        except Exception as ex:
            logger.debug("Rejecting %s: %s" % (dirn, ex))
    # Check primary unaligned dir
    if primary_illumina_data is None:
        raise Exception("Primary dir '%s' doesn't exist, or doesn't "
                        "contain data?" % primary_unaligned_dir)
    # Is there anything to do?
    if not unaligned_dirs:
        print("No extra bcl2fastq output directories found, nothing to do")
        return 0
    # Make log directory and set up scheduler (if not dry run)
    if not dry_run:
        ap.set_log_dir(ap.get_log_subdir('merge_fastq_dirs'))
        runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        sched = SimpleScheduler(
            runner=runner,
            max_concurrent=ap.settings.general.max_concurrent_jobs,
            poll_interval=ap.settings.general.poll_interval)
        sched.start()
        jobs = []
    # Top-level for undetermined reads
    if primary_illumina_data.undetermined.dirn != \
       primary_illumina_data.unaligned_dir:
        undetermined_dir = os.path.basename(
            primary_illumina_data.undetermined.dirn)
    else:
        undetermined_dir = None
    # Do sanity checks before proceeding
    print("Checking primary data directory")
    fmt = primary_illumina_data.format
    paired_end = primary_illumina_data.paired_end
    no_lane_splitting = (len(primary_illumina_data.lanes) == 1) \
                        and (primary_illumina_data.lanes[0] is None)
    print("* Format: %s" % fmt)
    print("* no-lane-splitting: %s" % ('yes' if no_lane_splitting else 'no'))
    print("* paired-end: %s" % ('yes' if paired_end else 'no'))
    print("* undetermined dir: %s" % undetermined_dir)
    consistent_data = True
    for unaligned_dir in unaligned_dirs:
        illumina_data = unaligned_dirs[unaligned_dir]
        fmt0 = illumina_data.format
        no_lane_splitting0 = (len(illumina_data.lanes) == 1) \
                             and (illumina_data.lanes[0] is None)
        if (fmt0 != fmt) or (no_lane_splitting0 != no_lane_splitting):
            print("!!! %s: inconsistent with primary data dir !!!" %
                  unaligned_dir)
            consistent_data = False
    if not consistent_data:
        raise Exception("Data directories not consistent with primary "
                        "dir '%s'" % primary_unaligned_dir)
    # Collect the projects from the extra directories
    projects = []
    undetermined = []
    for unaligned_dir in unaligned_dirs:
        print("Examining projects in %s:" % unaligned_dir)
        illumina_data = unaligned_dirs[unaligned_dir]
        for project in illumina_data.projects:
            if not list(filter(lambda p: p.name == project.name, projects)):
                print("- %s: will be merged in" % project.name)
                projects.append(project)
            else:
                raise Exception("collision: %s already exists" % project.name)
        # Deal with undetermined reads
        if illumina_data.undetermined is not None:
            print("Examining undetermined samples:")
            if no_lane_splitting:
                # No lane info: should merge undetermined fastqs
                for sample in illumina_data.undetermined.samples:
                    print("- %s: reads will be concatenated" % sample.name)
                    undetermined.append(sample)
            else:
                for sample in illumina_data.undetermined.samples:
                    if not list(
                            filter(lambda s: s.name == sample.name,
                                   undetermined)):
                        print("- %s: will be merged in" % sample.name)
                        undetermined.append(sample)
                    else:
                        raise Exception("collision: %s already exists" %
                                        sample.name)
        else:
            print("No undetermined samples")
    # Collect any remaining projects from the primary
    # unaligned directory
    print("Examining projects in primary dir %s:" % primary_unaligned_dir)
    for project in primary_illumina_data.projects:
        if not list(filter(lambda p: p.name == project.name, projects)):
            print("- %s: will be merged in" % project.name)
            projects.append(project)
        else:
            print("- %s: already exists, will be discarded" % project.name)
    # Sort out the undetermined reads
    print("Examining undetermined samples:")
    if no_lane_splitting:
        # No lane info: should merge undetermined fastqs
        for sample in primary_illumina_data.undetermined.samples:
            print("- %s: reads will be concatenated" % sample.name)
            undetermined.insert(0, sample)
    else:
        for sample in primary_illumina_data.undetermined.samples:
            if not list(filter(lambda s: s.name == sample.name, undetermined)):
                print("- %s: will be merged in" % sample.name)
                undetermined.insert(0, sample)
            else:
                print("- %s: already exists, will be discarded" % sample.name)
    # Make a new directory for the merging
    merge_dir = os.path.join(ap.analysis_dir, output_dir + ".new")
    if undetermined_dir is not None:
        merge_undetermined_dir = os.path.join(merge_dir, undetermined_dir)
    else:
        merge_undetermined_dir = merge_dir
    if not dry_run:
        print("Making temporary merge directory %s" % merge_dir)
        mkdir(merge_dir)
        if not os.path.exists(merge_undetermined_dir):
            print("Making directory for undetermined %s" %
                  merge_undetermined_dir)
            mkdir(merge_undetermined_dir)
    # Copy the projects
    print("Importing projects:")
    for project in projects:
        print("- %s" % project.name)
        project_dir = os.path.join(merge_dir, os.path.basename(project.dirn))
        cmd = copytree_command(project.dirn, project_dir)
        print("- Running %s" % cmd)
        if not dry_run:
            job = sched.submit(cmd,
                               name="copy_project.%s" % project.name,
                               wd=merge_dir)
            print("Job: %s" % job)
            jobs.append(job)
    # Handle the undetermined reads
    print("Dealing with undetermined reads:")
    if no_lane_splitting:
        # No lane info: merge undetermined fastqs
        if len(undetermined) == 1:
            # Only one undetermined sample - copy Fastqs
            sample = undetermined[0]
            for read in (1, 2):
                if read == 2 and not paired_end:
                    break
                fastqs = sample.fastq_subset(read_number=read, full_path=True)
                for fq in fastqs:
                    cmd = copy_command(fq, merge_undetermined_dir)
                    print("- Running %s" % cmd)
                    if not dry_run:
                        job = sched.submit(cmd,
                                           name="copy_undetermined.R%s" % read,
                                           wd=merge_dir)
                        print("Job: %s" % job)
                        jobs.append(job)
        else:
            # Multiple undetermined samples - concat Fastqs
            for read in (1, 2):
                if read == 2 and not paired_end:
                    break
                cmd = Command('concat_fastqs.py')
                for sample in undetermined:
                    fastqs = sample.fastq_subset(read_number=read,
                                                 full_path=True)
                    cmd.add_args(*fastqs)
                cmd.add_args(
                    os.path.join(merge_undetermined_dir,
                                 "Undetermined_S0_R%s_001.fastq.gz" % read))
                print("- Running %s" % cmd)
                if not dry_run:
                    job = sched.submit(cmd,
                                       name="merge_undetermined.R%s" % read,
                                       wd=merge_dir)
                    print("Job: %s" % job)
                    jobs.append(job)
    else:
        for sample in undetermined:
            print("- %s" % sample.name)
            if fmt == "bcl2fastq2":
                # Hardlink copy fastqs directly
                sample_dir = merge_undetermined_dir
                if not dry_run:
                    for fq in sample.fastq:
                        src_fq = os.path.join(sample.dirn, fq)
                        dst_fq = os.path.join(sample_dir, fq)
                        os.link(src_fq, dst_fq)
            else:
                # Just copy directory tree wholesale
                sample_dir = os.path.join(merge_undetermined_dir,
                                          os.path.basename(sample.dirn))
                cmd = copytree_command(sample.dirn, sample_dir)
                print("- Running %s" % cmd)
                if not dry_run:
                    job = sched.submit(cmd,
                                       name="copy_sample_dir.%s" % sample.name,
                                       wd=merge_dir)
                    print("Job: %s" % job.name)
                    jobs.append(job)
    # Make expected subdirs for bcl2fastq2
    if not dry_run and fmt == "bcl2fastq2":
        for dirn in ('Reports', 'Stats'):
            mkdir(os.path.join(merge_dir, dirn))
            # Add a hidden placeholder to preserve these directories
            # on rsync -m (prune empty dirs)
            with open(os.path.join(merge_dir, dirn, '.placeholder'),
                      'w') as fp:
                fp.write("")
    # Wait for scheduler jobs to complete
    if not dry_run:
        sched.wait()
        sched.stop()
        # Check job exit status
        exit_status = 0
        for j in jobs:
            exit_status += j.exit_status
            if j.exit_status != 0:
                logger.warning("Job failed: %s" % j)
        if exit_status:
            logger.critical("One or more jobs failed (non-zero "
                            "exit status)")
            return exit_status
    # Move all the 'old' directories out of the way
    all_unaligned = [u for u in unaligned_dirs]
    all_unaligned.append(primary_unaligned_dir)
    for unaligned_dir in all_unaligned:
        unaligned_backup = os.path.join(ap.analysis_dir,
                                        "save.%s" % unaligned_dir)
        print("Moving %s to %s" % (unaligned_dir, unaligned_backup))
        if not dry_run:
            shutil.move(os.path.join(ap.analysis_dir, unaligned_dir),
                        unaligned_backup)
    # Rename the merged directory
    print("Renaming %s to %s" % (merge_dir, output_dir))
    if not dry_run:
        shutil.move(merge_dir, os.path.join(ap.analysis_dir, output_dir))
    # Reset the bcl2fastq dir
    if not dry_run:
        ap.params['unaligned_dir'] = output_dir
    # Make a new 'projects.info' metadata file
    project_metadata_file = os.path.join(ap.analysis_dir, 'projects.info')
    if os.path.exists(project_metadata_file):
        print("Moving existing projects.info file out of the way")
        if not dry_run:
            os.rename(project_metadata_file,
                      os.path.join(ap.analysis_dir, 'save.projects.info'))
    print("Creating new projects.info file")
    if not dry_run:
        ap.make_project_metadata_file()
    return 0
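A design note on the collision checks above: the repeated list(filter(lambda ...)) membership tests rescan the accumulated lists on every iteration. Tracking the names already seen in a set is an equivalent and clearer pattern; a sketch under the same variable names:

# Sketch: set-based collision detection for merged projects
seen = set()
for unaligned_dir, illumina_data in unaligned_dirs.items():
    for project in illumina_data.projects:
        if project.name in seen:
            raise Exception("collision: %s already exists" % project.name)
        seen.add(project.name)
        projects.append(project)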
Example #13
    def create_directory(self,
                         illumina_project=None,
                         fastqs=None,
                         fastq_dir=None,
                         short_fastq_names=False,
                         link_to_fastqs=False):
        """Create and populate analysis directory for an IlluminaProject

        Creates a new directory corresponding to the AnalysisProject
        object, and optionally also populates with links to FASTQ files
        from a supplied IlluminaProject object.

        The directory structure it creates is:

        dir/
           fastqs/
           logs/
           ScriptCode/

        It also creates an info file with metadata about the project.

        Arguments:
          illumina_project: (optional) populated IlluminaProject object
            from which the analysis directory will be populated
          fastqs: (optional) list of fastq files to import
          fastq_dir: (optional) name of subdirectory to put fastq files
            into; defaults to 'fastqs'
          short_fastq_names: (optional) if True then transform fastq file
            names to be the shortest possible unique names; if False
            (default) then use the original fastq names
          link_to_fastqs: (optional) if True then make symbolic links
            to the fastq files; if False (default) then make hard links
    
        """
        logger.debug("Creating analysis directory for project '%s'" %
                     self.name)
        # Check for & create directory
        if os.path.exists(self.dirn):
            logger.warning("Directory %s already exists" % self.dirn)
        else:
            logger.debug("Making analysis directory %s" % self.dirn)
            bcf_utils.mkdir(self.dirn, mode=0o775)
        # Make a 'ScriptCode' directory
        scriptcode_dir = os.path.join(self.dirn, "ScriptCode")
        bcf_utils.mkdir(scriptcode_dir, mode=0o775)
        # Put a file in ScriptCode to make sure it's
        # not pruned on subsequent rsync operations
        with open(os.path.join(self.dirn, 'ScriptCode', 'README.txt'),
                  'w') as fp:
            fp.write("The ScriptCode directory is a place to put "
                     "custom scripts and programs")
        # Make a 'fastqs' directory
        if fastq_dir is None:
            fastq_dir = "fastqs"
        fastq_dir = os.path.join(self.dirn, fastq_dir)
        bcf_utils.mkdir(fastq_dir, mode=0o775)
        # Check for & create links to fastq files
        if fastqs is None:
            # Make a list of fastqs to import from the supplied
            # IlluminaProject object
            fastqs = []
            if illumina_project is not None:
                for sample in illumina_project.samples:
                    for fastq in sample.fastq:
                        fastqs.append(os.path.join(sample.dirn, fastq))
        if short_fastq_names:
            # Get mapping to (shortened) unique names
            fastq_names = IlluminaData.get_unique_fastq_names(fastqs)
        else:
            # Use full names
            fastq_names = {}
            for fq in fastqs:
                fastq_names[fq] = os.path.basename(fq)
        for fastq in fastqs:
            target_fq = os.path.join(fastq_dir, fastq_names[fastq])
            if os.path.exists(target_fq):
                logger.warning("Target '%s' already exists" % target_fq)
            else:
                if link_to_fastqs:
                    logger.debug("Making symlink to %s" % fastq)
                    bcf_utils.mklink(fastq, target_fq, relative=True)
                else:
                    logger.debug("Making hard link to %s" % fastq)
                    os.link(fastq, target_fq)
        # Populate
        self.populate(fastq_dir=os.path.basename(fastq_dir))
        # Update metadata: primary fastq dir
        self.info['primary_fastq_dir'] = os.path.relpath(fastq_dir, self.dirn)
        # Update metadata: sample summary
        self.info['samples'] = self.sample_summary()
        # Save metadata
        self.info.save(self.info_file)
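The link_to_fastqs switch above chooses between a relative symlink (both ends can move together, but the link breaks if the source alone moves) and a hard link (no extra disk usage, but source and target must share a filesystem). A stdlib-only sketch of the same choice:

import os

def link_fastq(fastq, target, symlink=False):
    """Link 'target' to 'fastq' as a relative symlink or a hard link"""
    if symlink:
        # Relative symlink, resolved from the target's directory
        rel = os.path.relpath(fastq, os.path.dirname(target))
        os.symlink(rel, target)
    else:
        # Hard link: requires a single filesystem
        os.link(fastq, target)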
Example #14
    def setup_qc_dir(self, qc_dir=None, fastq_dir=None):
        """
        Set up a QC outputs directory

        Creates a QC outputs directory with a metadata
        file 'qc.info'.

        Arguments:
          qc_dir (str): path to QC outputs directory
            to set up. If a relative path is supplied then
            is assumed to be relative to the analysis
            project directory. If 'None' then defaults to
            the current 'qc_dir' for the project.
          fastq_dir (str): set the associated source Fastq
            directory (optional). If 'None' then defaults
            to the previously associated fastq_dir for the
            QC dir (or the current 'fastq_dir' for the
            project if that isn't set).

        Returns:
          String: full path to the QC directory.

        Raises:
          Exception: if previously stored Fastq source dir
            doesn't match the one supplied via 'fastq_dir'.
        """
        print "Setting up QC directory"
        if qc_dir is None:
            qc_dir = os.path.relpath(self.qc_dir, self.dirn)
            print "Assuming default QC dir: %s" % qc_dir
        if not os.path.isabs(qc_dir):
            qc_dir = os.path.join(self.dirn, qc_dir)
        if not os.path.exists(qc_dir):
            print "Creating QC dir: %s" % qc_dir
            bcf_utils.mkdir(qc_dir, mode=0775)
        else:
            print "QC dir already exists: %s" % qc_dir
        # Set up metadata
        qc_info = self.qc_info(qc_dir)
        print "qc_dir            : %s" % qc_dir
        print "Supplied fastq_dir: %s" % fastq_dir
        print "Stored fastq_dir  : %s" % qc_info.fastq_dir
        if fastq_dir is None:
            if qc_info.fastq_dir is not None:
                fastq_dir = qc_info.fastq_dir
                print "Using stored Fastq dir for this QC dir"
            else:
                fastq_dir = os.path.relpath(self.fastq_dir, self.dirn)
                print "Assuming default Fastq dir: %s" % fastq_dir
        if qc_info.fastq_dir is not None:
            if qc_info.fastq_dir != fastq_dir:
                raise Exception(
                    "Project '%s': supplied Fastq dir ('%s') "
                    "differs from stored dir ('%s') for QC "
                    "dir '%s'" %
                    (self.name, fastq_dir, qc_info.fastq_dir, qc_dir))
        print "Setting associated Fastq dir: %s" % fastq_dir
        qc_info['fastq_dir'] = fastq_dir
        qc_info.save()
        # Return the path to the QC directory
        return qc_dir
Example #15
    def run(self,nthreads=None,fastq_strand_indexes=None,
            fastq_subset=None,working_dir=None,log_file=None,
            batch_size=None,max_jobs=1,poll_interval=5,
            runners=None,default_runner=None,verbose=False):
        """
        Run the tasks in the pipeline

        Arguments:
          nthreads (int): number of threads/processors to
            use for QC jobs (defaults to 1)
          fastq_strand_indexes (dict): mapping of organism
            IDs to directories with STAR index
          fastq_subset (int): explicitly specify the subset
            size to use when subsetting Fastqs
          working_dir (str): optional path to a working
            directory (defaults to temporary directory in
            the current directory)
          log_file (str): path of file that pipeline log
            output will be written to
          batch_size (int): if set then run commands in
            each task in batches, with each batch running
            this many commands at a time (default is to run
            one command per job)
          max_jobs (int): optional maximum number of
            concurrent jobs in scheduler (defaults to 1)
          poll_interval (float): optional polling interval
            (seconds) to set in scheduler (defaults to 5s)
          runners (dict): mapping of names to JobRunner
            instances; valid names are 'qc_runner',
            'report_runner', 'verify_runner' and 'default'
          default_runner (JobRunner): optional default
            job runner to use
          verbose (bool): if True then report additional
            information for diagnostics
        """
        # Working directory
        clean_up_on_completion = False
        if working_dir is None:
            working_dir = tempfile.mkdtemp(prefix="__qc.",
                                           suffix=".tmp",
                                           dir=os.getcwd())
            clean_up_on_completion = True
        working_dir = os.path.abspath(working_dir)
        if not os.path.exists(working_dir):
            mkdir(working_dir)

        # Log and script directories
        log_dir = os.path.join(working_dir,"logs")
        scripts_dir = os.path.join(working_dir,"scripts")

        # Execute the pipeline
        status = Pipeline.run(self,
                              working_dir=working_dir,
                              log_dir=log_dir,
                              scripts_dir=scripts_dir,
                              log_file=log_file,
                              batch_size=batch_size,
                              exit_on_failure=PipelineFailure.DEFERRED,
                              params={
                                  'nthreads': nthreads,
                                  'fastq_subset': fastq_subset,
                                  'fastq_strand_indexes': fastq_strand_indexes,
                              },
                              max_jobs=max_jobs,
                              runners=runners,
                              default_runner=default_runner,
                              verbose=verbose)

        # Clean up working dir
        if status == 0 and clean_up_on_completion:
            shutil.rmtree(working_dir)

        # Return pipeline status
        return status
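A hypothetical invocation of run, assuming the pipeline instance and a JobRunner implementation from the surrounding package (SimpleJobRunner here is illustrative); the runner names follow the docstring:

# Hypothetical usage - runner names as listed in the docstring
status = pipeline.run(nthreads=8,
                      fastq_subset=10000,
                      max_jobs=4,
                      runners={
                          'qc_runner': SimpleJobRunner(),
                          'report_runner': SimpleJobRunner(),
                          'verify_runner': SimpleJobRunner(),
                      },
                      verbose=True)
if status != 0:
    raise Exception("QC pipeline failed")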