def write_output(stats_file_path, image_index, base_name, fastq_image_aligner, path_info, all_tile_data, make_pdfs, um_per_pixel):
    """
    Write the results of one image's alignment to disk.

    Nothing is written when `load_existing_score(stats_file_path)` reports a
    positive score: the existing alignment is kept and the difference between
    the new and the existing score is only logged.

    Otherwise the following files are produced:
      * `stats_file_path`: the serialized alignment statistics
      * `<results_directory>/<base_name>/<image_index>_all_read_rcs.txt`: one
        line per entry of the all-reads aligner's `read_names_rcs`
      * when `make_pdfs` is truthy, `<image_index>_all_hits.pdf` and
        `<image_index>_hit_hists.pdf` under `<figure_directory>/<base_name>`

    Returns False if the alignment was skipped, True if it was saved.

    """
    rcs_filename = '{}_all_read_rcs.txt'.format(image_index)
    all_read_rcs_filepath = os.path.join(path_info.results_directory, base_name, rcs_filename)

    # A positive existing score means this image was already aligned (possibly
    # by a different strategy). We never overwrite it; we only report how the
    # new score compares.
    previous_score = load_existing_score(stats_file_path)
    alignment_stats = fastq_image_aligner.alignment_stats
    if previous_score > 0:
        score_difference = alignment_stats.score - previous_score
        log.debug("Alignment already exists for %s/%s, skipping. Score difference: %d." % (base_name, image_index, score_difference))
        return False

    # Record how the image was aligned
    log.info("Saving alignment with score of %s\t\t%s" % (alignment_stats.score, base_name))
    with open(stats_file_path, 'w') as stats_file:
        stats_file.write(alignment_stats.serialized)

    # Record the corrected location of every read, derived from the alignment
    # that was found with the aligned subset of reads
    all_reads_aligner = fastqimagealigner.FastqImageAligner(um_per_pixel)
    all_reads_aligner.all_reads_fic_from_aligned_fic(fastq_image_aligner, all_tile_data)
    with open(all_read_rcs_filepath, 'w') as rcs_file:
        rcs_file.writelines(all_reads_aligner.read_names_rcs)

    # Optional diagnostic PDFs that visualize the alignment
    if make_pdfs:
        diagnostics = (
            (plotting.plot_all_hits, '{}_all_hits.pdf'),
            (plotting.plot_hit_hists, '{}_hit_hists.pdf'),
        )
        for make_plot, filename_template in diagnostics:
            ax = make_plot(fastq_image_aligner)
            pdf_path = os.path.join(path_info.figure_directory, base_name, filename_template.format(image_index))
            ax.figure.savefig(pdf_path)
            # closes the current pyplot figure (presumably the one just saved)
            plt.close()

    # Drop our local references to the aligners before returning
    del all_reads_aligner
    del fastq_image_aligner
    return True
def run_data_channel(cluster_strategy, h5_filenames, channel_name, path_info, alignment_tile_data, all_tile_data, metadata, clargs, process_limit):
    """
    Align the images of one data channel, one HDF5 file at a time.

    A FastqImageAligner is loaded once with `alignment_tile_data` and bound,
    together with the other fixed arguments, into a partial of
    `process_data_image`. For each HDF5 file a fresh process pool maps that
    partial over the items yielded by `load_aligned_stats_files` for the
    alignment channel.

    The number of worker processes comes from `calculate_process_count` and is
    capped at `process_limit` when `process_limit` is positive.

    Any exception raised while mapping (including KeyboardInterrupt) is
    re-raised after the pool's workers have been terminated and joined, so a
    failure does not leave worker processes behind.

    """
    image_count = count_images(h5_filenames, channel_name)
    num_processes, chunksize = calculate_process_count(image_count)
    if process_limit > 0:
        num_processes = min(process_limit, num_processes)
    log.debug("Aligning data images with %d cores with chunksize %d" % (num_processes, chunksize))

    log.debug("Loading reads into FASTQ Image Aligner.")
    # NOTE(review): the aligner is built from metadata['microns_per_pixel'] while
    # the workers receive clargs.microns_per_pixel -- confirm these always agree.
    fastq_image_aligner = fastqimagealigner.FastqImageAligner(metadata['microns_per_pixel'])
    fastq_image_aligner.load_reads(alignment_tile_data)
    log.debug("Reads loaded.")

    second_processor = functools.partial(process_data_image,
                                         cluster_strategy,
                                         path_info,
                                         all_tile_data,
                                         clargs.microns_per_pixel,
                                         clargs.make_pdfs,
                                         channel_name,
                                         fastq_image_aligner,
                                         clargs.min_hits)

    for h5_filename in h5_filenames:
        pool = multiprocessing.Pool(num_processes)
        try:
            log.debug("Doing second channel alignment of all images with %d cores" % num_processes)
            stats_files = load_aligned_stats_files([h5_filename], metadata['alignment_channel'], path_info)
            # map_async().get(timeout) is used instead of map(); presumably so the
            # parent process stays interruptible with Ctrl-C while waiting
            pool.map_async(second_processor, stats_files, chunksize=chunksize).get(sys.maxint)
        except BaseException:
            # Something went wrong (or we were interrupted): stop the workers
            # immediately instead of leaving them running, then propagate
            pool.terminate()
            raise
        else:
            pool.close()
        finally:
            # join() is valid after either close() or terminate()
            pool.join()
            gc.collect()
    log.debug("Done aligning!")
def main(clargs):
    """
    Run the alignment pipeline for the image directory given in `clargs`.

    Steps, in order:
      1. Load metadata and the cache; run `preprocess` if the cache says
         preprocessing has not happened yet.
      2. Load the HDF5 filenames and report a failure via `error.fail` if
         there are none.
      3. Build the PathInfo, make the output directories and load the read
         names (alignment, perfect target, on target, all).
      4. Determine the end tiles (or reuse the cached ones) and store them in
         the cache.
      5. Align the alignment (phiX) channel with every cluster strategy unless
         the cache says this was already done.
      6. Stop with exit status 0 if `clargs.fiducial_only` is set.
      7. For every other channel (or the alignment channel itself when no
         other channel exists) call `combo_align` with the on-target and the
         perfect-target reads, whenever those read sets are non-empty.

    """
    metadata = initialize.load_metadata(clargs.image_directory)
    cache = initialize.load_cache(clargs.image_directory)
    if not cache['preprocessed']:
        preprocess(clargs.image_directory, cache)

    h5_filenames = load_filenames(clargs.image_directory)
    if len(h5_filenames) == 0:
        # presumably error.fail() aborts the program -- verify against its definition
        error.fail("There were no HDF5 files to process. You must have deleted or moved them after preprocessing them.")

    path_info = PathInfo(clargs.image_directory,
                         metadata['mapped_reads'],
                         metadata['perfect_target_name'],
                         metadata['alternate_fiducial_reads'],
                         metadata['alternate_perfect_target_reads_filename'],
                         metadata['alternate_good_target_reads_filename'])
    # Ensure we have the directories where output will be written
    align.make_output_directories(h5_filenames, path_info)

    log.debug("Loading tile data.")
    # chip.load() returns a callable, which is then called with the port orientation
    sequencing_chip = chip.load(metadata['chip_type'])(metadata['ports_on_right'])

    alignment_tile_data = align.load_read_names(path_info.aligning_read_names_filepath)
    perfect_tile_data = align.load_read_names(path_info.perfect_read_names)
    on_target_tile_data = align.load_read_names(path_info.on_target_read_names)
    all_tile_data = align.load_read_names(path_info.all_read_names_filepath)
    log.debug("Tile data loaded.")

    # We use one process per concentration. We could theoretically speed this up since our machine
    # has significantly more cores than the typical number of concentration points, but since it
    # usually finds a result in the first image or two, it's not going to deliver any practical benefits
    log.debug("Loading FastQImageAligner")
    fia = fastqimagealigner.FastqImageAligner(clargs.microns_per_pixel)
    fia.load_reads(alignment_tile_data)
    log.debug("Loaded %s points" % sum(len(v) for v in alignment_tile_data.values()))
    log.debug("FastQImageAligner loaded.")

    if 'end_tiles' not in cache:
        end_tiles = align.get_end_tiles(cluster_strategies,
                                        clargs.rotation_adjustment,
                                        h5_filenames,
                                        metadata['alignment_channel'],
                                        clargs.snr,
                                        metadata,
                                        sequencing_chip,
                                        fia)
        # Persist the result so that a later run can skip this step
        cache['end_tiles'] = end_tiles
        initialize.save_cache(clargs.image_directory, cache)
    else:
        log.debug("End tiles already calculated.")
        end_tiles = cache['end_tiles']
    gc.collect()

    if not cache['phix_aligned']:
        for cluster_strategy in cluster_strategies:
            align.run(cluster_strategy,
                      clargs.rotation_adjustment,
                      h5_filenames,
                      path_info,
                      clargs.snr,
                      clargs.min_hits,
                      fia,
                      end_tiles,
                      metadata['alignment_channel'],
                      all_tile_data,
                      metadata,
                      clargs.make_pdfs,
                      sequencing_chip,
                      clargs.process_limit)
        cache['phix_aligned'] = True
        initialize.save_cache(clargs.image_directory, cache)
    else:
        log.debug("Phix already aligned.")

    if clargs.fiducial_only:
        # the user doesn't want us to align the protein channels.
        # sys.exit() is used rather than the exit() helper, which is injected by
        # the site module for interactive use and is not guaranteed to exist.
        sys.exit(0)

    gc.collect()

    alignment_channel = metadata['alignment_channel']
    protein_channels = [channel for channel in projectinfo.load_channels(clargs.image_directory)
                        if channel != alignment_channel]
    if protein_channels:
        log.debug("Protein channels found: %s" % ", ".join(protein_channels))
    else:
        # protein is in phix channel, hopefully?
        log.warn("No protein channels detected. Assuming protein is in phiX channel: %s" % [alignment_channel])
        protein_channels = [alignment_channel]

    # Read sets to align against, paired with the suffix that names the
    # resulting channel combination. Empty read sets are skipped below.
    target_read_sets = (
        (on_target_tile_data, "_on_target"),
        (perfect_tile_data, "_perfect_target"),
    )

    for channel_name in protein_channels:
        # Attempt to precision align protein channels using the phix channel alignment as a starting point.
        # Not all experiments have "on target" or "perfect target" reads - that only applies to CRISPR systems
        # (at the time of this writing anyway)
        for cluster_strategy in cluster_strategies:
            gc.collect()
            for tile_data, suffix in target_read_sets:
                if not tile_data:
                    continue
                channel_combo = channel_name + suffix
                combo_align(cluster_strategy, h5_filenames, channel_combo, channel_name, path_info,
                            tile_data, all_tile_data, metadata, cache, clargs)
                gc.collect()