def main():
    if len(sys.argv) != 2:
        print("Please provide one argument: manifest file!")
        sys.exit()

    manifest_file = sys.argv[1]

    # Check that the manifest file exists
    if not os.path.isfile(manifest_file):
        print("File {} does not exist. Exiting...".format(manifest_file))
        sys.exit()

    # Read the manifest file (key=value pairs, one per line)
    options = {}
    with open(manifest_file, "r") as fp:
        for line in fp:
            kv = line.rstrip().split("=")
            options[kv[0]] = kv[1]

    if 'flowcell_directory' not in options:
        print('flowcell_directory is not specified in the manifest file. Exiting...')
        sys.exit()
    if 'output_folder' not in options:
        print('output_folder is not specified in the manifest file. Exiting...')
        sys.exit()
    if 'metadata_file' not in options:
        print('metadata_file is not specified in the manifest file. Exiting...')
        sys.exit()
    if 'flowcell_barcode' not in options:
        print('flowcell_barcode is not specified in the manifest file. Exiting...')
        sys.exit()

    flowcell_directory = options['flowcell_directory']
    output_folder = options['output_folder']
    metadata_file = options['metadata_file']
    flowcell_barcode = options['flowcell_barcode']

    dropseq_folder = options['dropseq_folder'] if 'dropseq_folder' in options else '/broad/macosko/bin/dropseq-tools'
    picard_folder = options['picard_folder'] if 'picard_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/picard'
    STAR_folder = options['STAR_folder'] if 'STAR_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/STAR-2.5.2a'
    scripts_folder = options['scripts_folder'] if 'scripts_folder' in options else '/broad/macosko/jilong/slideseq_pipeline/scripts'
    email_address = options['email_address'] if 'email_address' in options else ''

    if not os.path.isdir(flowcell_directory):
        print("Folder {} does not exist. Exiting...".format(flowcell_directory))
        sys.exit()
    if not os.path.isfile(metadata_file):
        print("File {} does not exist. Exiting...".format(metadata_file))
        sys.exit()
    if not os.path.isdir(dropseq_folder):
        print("Folder {} does not exist. Exiting...".format(dropseq_folder))
        sys.exit()
    if not os.path.isdir(picard_folder):
        print("Folder {} does not exist. Exiting...".format(picard_folder))
        sys.exit()
    if not os.path.isdir(STAR_folder):
        print("Folder {} does not exist. Exiting...".format(STAR_folder))
        sys.exit()
    if not os.path.isdir(scripts_folder):
        print("Folder {} does not exist. Exiting...".format(scripts_folder))
        sys.exit()

    library_folder = options['library_folder'] if 'library_folder' in options else '{}/libraries'.format(output_folder)

    runinfo_file = '{}/RunInfo.xml'.format(flowcell_directory)
    if not os.path.isfile(runinfo_file):
        print("File {} does not exist. Exiting...".format(runinfo_file))
        sys.exit()

    try:
        # Create output directories
        if not os.path.isdir(output_folder):
            call(['mkdir', '-p', output_folder])
        if not os.path.isdir('{}/logs'.format(output_folder)):
            call(['mkdir', '-p', '{}/logs'.format(output_folder)])
        call(['mkdir', '-p', '{}/status'.format(output_folder)])
        if not os.path.isdir(library_folder):
            call(['mkdir', '-p', library_folder])
        if 'temp_folder' not in options:
            call(['mkdir', '-p', '{}/tmp'.format(output_folder)])
    except Exception as exp:
        print("EXCEPTION!")
        print(exp)
        traceback.print_tb(exp.__traceback__, file=sys.stdout)
        print("Folder {} cannot be created. Exiting...".format(output_folder))
        sys.exit()

    log_file = '{}/logs/workflow.log'.format(output_folder)
    write_log(log_file, flowcell_barcode, "The Slide-seq alignment pipeline is starting to run.")

    # Call run_preparation
    output_file = '{}/logs/run_preparation.log'.format(output_folder)
    submission_script = '{}/run_preparation.sh'.format(scripts_folder)
    call_args = [
        'qsub', '-o', output_file, '-l', 'h_vmem=20g', '-notify', '-l', 'h_rt=10:0:0',
        '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7',
        submission_script, manifest_file, scripts_folder, output_folder
    ]
    call_to_taskrunner(output_folder, call_args)

    if len(email_address) > 1:
        subject = "Submission received for " + flowcell_barcode
        content = ("Thank you for your interest in the Slide-seq tools! We received your request. "
                   "An email will be sent to you once the workflow finishes.")
        call_args = [
            'python', '{}/send_email.py'.format(scripts_folder),
            email_address, subject, content
        ]
        call(call_args)
def main():
    if len(sys.argv) != 3:
        print("Please provide two arguments: manifest file and lane ID!")
        sys.exit()

    manifest_file = sys.argv[1]
    lane = sys.argv[2]

    # Check that the manifest file exists
    if not os.path.isfile(manifest_file):
        print("File {} does not exist. Exiting...".format(manifest_file))
        sys.exit()

    # Read the manifest file (key=value pairs, one per line)
    options = {}
    with open(manifest_file, "r") as fp:
        for line in fp:
            kv = line.rstrip().split("=")
            options[kv[0]] = kv[1]

    flowcell_directory = options['flowcell_directory']
    output_folder = options['output_folder']
    metadata_file = options['metadata_file']
    flowcell_barcode = options['flowcell_barcode']

    library_folder = options['library_folder'] if 'library_folder' in options else '{}/libraries'.format(output_folder)
    tmpdir = options['temp_folder'] if 'temp_folder' in options else '{}/tmp'.format(output_folder)
    dropseq_folder = options['dropseq_folder'] if 'dropseq_folder' in options else '/broad/macosko/bin/dropseq-tools'
    picard_folder = options['picard_folder'] if 'picard_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/picard'
    STAR_folder = options['STAR_folder'] if 'STAR_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/STAR-2.5.2a'
    scripts_folder = options['scripts_folder'] if 'scripts_folder' in options else '/broad/macosko/jilong/slideseq_pipeline/scripts'
    is_NovaSeq = str2bool(options['is_NovaSeq']) if 'is_NovaSeq' in options else False
    is_NovaSeq_S4 = str2bool(options['is_NovaSeq_S4']) if 'is_NovaSeq_S4' in options else False
    num_slice_NovaSeq = int(options['num_slice_NovaSeq']) if 'num_slice_NovaSeq' in options else 10
    num_slice_NovaSeq_S4 = int(options['num_slice_NovaSeq_S4']) if 'num_slice_NovaSeq_S4' in options else 40
    email_address = options['email_address'] if 'email_address' in options else ''

    basecalls_dir = '{}/Data/Intensities/BaseCalls'.format(flowcell_directory)
    log_file = '{}/logs/workflow.log'.format(output_folder)

    # Get read structure from RunInfo.xml
    runinfo_file = '{}/RunInfo.xml'.format(flowcell_directory)
    read_structure = get_read_structure(runinfo_file)

    # Get tile information from RunInfo.xml
    slice_id = {}
    slice_first_tile = {}
    slice_tile_limit = {}
    tile_nums = get_tiles(runinfo_file, lane)
    tile_cou = len(tile_nums)
    if (not is_NovaSeq) and (not is_NovaSeq_S4):
        slice_id[lane] = ['0']
        slice_first_tile[lane] = [str(tile_nums[0])]
        slice_tile_limit[lane] = [str(tile_cou)]
    else:
        slice_cou = num_slice_NovaSeq if is_NovaSeq else num_slice_NovaSeq_S4
        tile_cou_per_slice = (tile_cou // slice_cou) + 1
        slice_id[lane] = []
        slice_first_tile[lane] = []
        slice_tile_limit[lane] = []
        for i in range(slice_cou):
            if tile_cou_per_slice * i >= tile_cou:
                break
            slice_id[lane].append(str(i))
            slice_first_tile[lane].append(str(tile_nums[tile_cou_per_slice * i]))
            slice_tile_limit[lane].append(str(tile_cou_per_slice))

    folder_running = '{}/status/running.processbarcodes_lane_{}'.format(output_folder, lane)
    folder_finished = '{}/status/finished.processbarcodes_lane_{}'.format(output_folder, lane)
    folder_failed = '{}/status/failed.processbarcodes_lane_{}'.format(output_folder, lane)

    try:
        call(['mkdir', '-p', folder_running])

        now = datetime.now()
        dt_string = now.strftime("%Y-%m-%d %H:%M:%S")
        print(dt_string)

        # Extract Illumina barcodes
        commandStr = 'java -Djava.io.tmpdir=' + tmpdir + ' -XX:+UseParallelOldGC -XX:ParallelGCThreads=1 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx4000m '
        commandStr += '-jar ' + picard_folder + '/picard.jar ExtractIlluminaBarcodes TMP_DIR=' + tmpdir + ' VALIDATION_STRINGENCY=SILENT '
        commandStr += 'BASECALLS_DIR=' + basecalls_dir + ' OUTPUT_DIR=' + output_folder + '/' + lane + '/barcodes LANE=' + lane + ' '
        commandStr += 'READ_STRUCTURE=' + read_structure + ' BARCODE_FILE=' + output_folder + '/' + lane + '/barcode_params.txt '
        commandStr += 'METRICS_FILE=' + output_folder + '/' + lane + '/' + flowcell_barcode + '.' + lane + '.barcode_metrics COMPRESS_OUTPUTS=true NUM_PROCESSORS=4'
        write_log(log_file, flowcell_barcode, "ExtractIlluminaBarcodes for Lane " + lane + " Command=" + commandStr)
        os.system(commandStr)
        write_log(log_file, flowcell_barcode, "ExtractIlluminaBarcodes for Lane " + lane + " is done.")

        # Convert Illumina base calls to SAM (unmapped.bam), one job per tile slice
        for i in range(len(slice_id[lane])):
            commandStr = 'java -Djava.io.tmpdir=' + tmpdir + ' -XX:+UseParallelOldGC -XX:ParallelGCThreads=1 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx10192m '
            commandStr += '-jar ' + picard_folder + '/picard.jar IlluminaBasecallsToSam TMP_DIR=' + tmpdir + ' VALIDATION_STRINGENCY=SILENT '
            commandStr += 'BASECALLS_DIR=' + basecalls_dir + ' LANE=' + lane + ' RUN_BARCODE=' + flowcell_barcode + ' NUM_PROCESSORS=4 '
            commandStr += 'READ_STRUCTURE=' + read_structure + ' LIBRARY_PARAMS=' + output_folder + '/' + lane + '/' + slice_id[lane][i] + '/library_params.txt INCLUDE_NON_PF_READS=false '
            commandStr += 'APPLY_EAMSS_FILTER=false MAX_READS_IN_RAM_PER_TILE=600000 ADAPTERS_TO_CHECK=null IGNORE_UNEXPECTED_BARCODES=true'
            commandStr += ' SEQUENCING_CENTER=BI BARCODES_DIR=' + output_folder + '/' + lane + '/barcodes FIRST_TILE=' + slice_first_tile[lane][i] + ' TILE_LIMIT=' + slice_tile_limit[lane][i]

            output_file = '{}/logs/run_barcodes2sam_lane_{}_{}.log'.format(output_folder, lane, slice_id[lane][i])
            submission_script = '{}/run_barcodes2sam.sh'.format(scripts_folder)
            call_args = [
                'qsub', '-o', output_file, '-l', 'h_vmem=100G', '-notify', '-l', 'h_rt=50:0:0',
                '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7',
                submission_script, manifest_file, commandStr, lane, slice_id[lane][i],
                scripts_folder, output_folder, '{}/{}'.format(output_folder, lane)
            ]
            call_to_taskrunner(output_folder, call_args)

        now = datetime.now()
        dt_string = now.strftime("%Y-%m-%d %H:%M:%S")
        print(dt_string)

        call(['mv', folder_running, folder_finished])
    except Exception as exp:
        print("EXCEPTION!")
        print(exp)
        traceback.print_tb(exp.__traceback__, file=sys.stdout)
        if os.path.isdir(folder_running):
            call(['mv', folder_running, folder_failed])
        else:
            call(['mkdir', '-p', folder_failed])
        if len(email_address) > 1:
            subject = "Slide-seq workflow failed for " + flowcell_barcode
            content = ("The Slide-seq workflow for lane " + lane + " failed at the step of processing barcodes. "
                       "Please check the log file for the issues.")
            call_args = [
                'python', '{}/send_email.py'.format(scripts_folder),
                email_address, subject, content
            ]
            call(call_args)
        sys.exit()
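# The per-lane tile slicing above determines how IlluminaBasecallsToSam is
# parallelized: each slice receives a FIRST_TILE and a TILE_LIMIT so that the
# submitted jobs cover the lane's tiles without overlap. A self-contained sketch
# of the same arithmetic follows; the name slice_tiles is hypothetical, and
# tile_nums would normally come from get_tiles(runinfo_file, lane).
def slice_tiles(tile_nums, slice_cou):
    """Return (first_tile, tile_limit) pairs covering tile_nums in slice_cou chunks (sketch)."""
    tile_cou = len(tile_nums)
    tile_cou_per_slice = (tile_cou // slice_cou) + 1
    slices = []
    for i in range(slice_cou):
        if tile_cou_per_slice * i >= tile_cou:
            break
        slices.append((tile_nums[tile_cou_per_slice * i], tile_cou_per_slice))
    return slices

# Example: slice_tiles([1101, 1102, 1103, 1104, 1105], 2) -> [(1101, 3), (1104, 3)].
# The final slice may ask for more tiles than remain; that is harmless because
# Picard's TILE_LIMIT is an upper bound on the number of tiles processed.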
def main(): if len(sys.argv) != 4: print( "Please provide three arguments: manifest file, library ID and locus function!" ) sys.exit() manifest_file = sys.argv[1] library = sys.argv[2] locus_function_list = sys.argv[3] # Check if the manifest file exists if not os.path.isfile(manifest_file): print("File {} does not exist. Exiting...".format(manifest_file)) sys.exit() # Read manifest file options = {} with open(manifest_file, "r") as fp: for line in fp: dict = line.rstrip().split("=") options[dict[0]] = dict[1] fp.close() flowcell_directory = options['flowcell_directory'] output_folder = options['output_folder'] metadata_file = options['metadata_file'] flowcell_barcode = options['flowcell_barcode'] library_folder = options[ 'library_folder'] if 'library_folder' in options else '{}/libraries'.format( output_folder) tmpdir = options[ 'temp_folder'] if 'temp_folder' in options else '{}/tmp'.format( output_folder) dropseq_folder = options[ 'dropseq_folder'] if 'dropseq_folder' in options else '/broad/macosko/bin/dropseq-tools' picard_folder = options[ 'picard_folder'] if 'picard_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/picard' STAR_folder = options[ 'STAR_folder'] if 'STAR_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/STAR-2.5.2a' scripts_folder = options[ 'scripts_folder'] if 'scripts_folder' in options else '/broad/macosko/jilong/slideseq_pipeline/scripts' is_NovaSeq = str2bool( options['is_NovaSeq']) if 'is_NovaSeq' in options else False is_NovaSeq_S4 = str2bool( options['is_NovaSeq_S4']) if 'is_NovaSeq_S4' in options else False num_slice_NovaSeq = int( options['num_slice_NovaSeq']) if 'num_slice_NovaSeq' in options else 10 num_slice_NovaSeq_S4 = int(options['num_slice_NovaSeq_S4'] ) if 'num_slice_NovaSeq_S4' in options else 40 runinfo_file = '{}/RunInfo.xml'.format(flowcell_directory) log_file = '{}/logs/workflow.log'.format(output_folder) # Read info from metadata file lanes = [] lanes_unique = [] libraries = [] libraries_unique = [] barcodes = [] bead_structures = [] reference = '' run_barcodematching = False puckcaller_path = '' bead_type = '' sequence = 'AAGCAGTGGTATCAACGCAGAGTGAATGGG' base_quality = '10' min_transcripts_per_cell = '10' email_address = '' experiment_date = '' with open('{}/parsed_metadata.txt'.format(output_folder), 'r') as fin: reader = csv.reader(fin, delimiter='\t') rows = list(reader) row0 = rows[0] for i in range(1, len(rows)): row = rows[i] lanes.append(row[row0.index('lane')]) if row[row0.index('lane')] not in lanes_unique: lanes_unique.append(row[row0.index('lane')]) libraries.append(row[row0.index('library')]) if row[row0.index('library')] not in libraries_unique: libraries_unique.append(row[row0.index('library')]) barcodes.append(row[row0.index('sample_barcode')]) bead_structures.append(row[row0.index('bead_structure')]) if row[row0.index('library')] == library: reference = row[row0.index('reference')] sequence = row[row0.index('start_sequence')] base_quality = row[row0.index('base_quality')] min_transcripts_per_cell = row[row0.index( 'min_transcripts_per_cell')] email_address = row[row0.index('email')] run_barcodematching = str2bool( row[row0.index('run_barcodematching')]) puckcaller_path = row[row0.index('puckcaller_path')] bead_type = row[row0.index('bead_type')] experiment_date = row[row0.index('date')] fin.close() reference_folder = reference[:reference.rfind('/')] referencePure = reference[reference.rfind('/') + 1:] if (referencePure.endswith('.gz')): referencePure = referencePure[:referencePure.rfind('.')] 
referencePure = referencePure[:referencePure.rfind('.')] genome_dir = '{}/STAR'.format(reference_folder) intervals = '{}/{}.genes.intervals'.format(reference_folder, referencePure) annotations_file = '{}/{}.gtf'.format(reference_folder, referencePure) ref_flat = '{}/{}.refFlat'.format(reference_folder, referencePure) ribosomal_intervals = '{}/{}.rRNA.intervals'.format( reference_folder, referencePure) reference2 = referencePure + '.' + locus_function_list folder_running = '{}/status/running.write_bijective_mapping_{}_{}'.format( output_folder, library, locus_function_list) folder_finished = '{}/status/finished.write_bijective_mapping_{}_{}'.format( output_folder, library, locus_function_list) folder_failed = '{}/status/failed.write_bijective_mapping_{}_{}'.format( output_folder, library, locus_function_list) alignment_folder = '{}/{}_{}/{}/alignment'.format(library_folder, experiment_date, library, reference2) barcode_matching_folder = '{}/{}_{}/{}/barcode_matching'.format( library_folder, experiment_date, library, reference2) dge_gzfile = '{}/{}.digital_expression.txt.gz'.format( alignment_folder, library) dge_file = '{}/{}.digital_expression2.txt'.format(alignment_folder, library) uniqueMappedDge_file = '{}/{}.UniqueMappedDge.txt'.format( alignment_folder, library) MappedDGEForR_file = '{}/MappedDGEForR.csv'.format(alignment_folder) call(['mkdir', '-p', folder_running]) try: now = datetime.now() dt_string = now.strftime("%Y-%m-%d %H:%M:%S") print(dt_string) # UniqueMappedIlluminaBarcodes bci_file = '{}/{}_barcode_matching.txt'.format(barcode_matching_folder, library) unique_bci_file = '{}/{}_unique_matched_illumina_barcodes.txt'.format( barcode_matching_folder, library) if not os.path.isfile(dge_file): os.system('gunzip -c {} > {}'.format(dge_gzfile, dge_file)) location_file = '{}/{}_matched_bead_locations.txt'.format( barcode_matching_folder, library) genename_file = '{}/{}_genenames.txt'.format(barcode_matching_folder, library) bcb_file = '{}/{}_unique_matched_beads.txt'.format( barcode_matching_folder, library) commandStr = 'perl ' + scripts_folder + '/get_unique_mapped_dge.pl ' + dge_file + ' ' + uniqueMappedDge_file + ' ' + genename_file + ' ' + bcb_file os.system(commandStr) # Call run_WriteBijectiveMapping # 'UniqueMappedBeads','UniqueMappedDGE','UniqueMappedIlluminaBarcodes','GeneNames' output_file = '{}/logs/run_WriteBijectiveMapping_{}_{}.log'.format( output_folder, library, locus_function_list) submission_script = '/broad/macosko/jilong/slideseq_pipeline/scripts/run_WriteBijectiveMapping.sh' call_args = [ 'qsub', '-o', output_file, '-l', 'h_vmem=65G', '-notify', '-l', 'h_rt=10:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script, '/broad/software/nonfree/Linux/redhat_7_x86_64/pkgs/matlab_2019a', scripts_folder, bcb_file, uniqueMappedDge_file, unique_bci_file, genename_file, location_file, puckcaller_path, output_folder ] call_to_taskrunner(output_folder, call_args) commandStr = 'perl ' + scripts_folder + '/txt2csv.pl ' + dge_file + ' ' + MappedDGEForR_file os.system(commandStr) if os.path.isfile(dge_file): call(['rm', dge_file]) now = datetime.now() dt_string = now.strftime("%Y-%m-%d %H:%M:%S") print(dt_string) call(['mv', folder_running, folder_finished]) except Exception as exp: print("EXCEPTION:!") print(exp) traceback.print_tb(exp.__traceback__, file=sys.stdout) if os.path.isdir(folder_running): call(['mv', folder_running, folder_failed]) elif os.path.isdir(folder_waiting): call(['mv', folder_waiting, folder_failed]) else: call(['mkdir', '-p', 
folder_failed]) if len(email_address) > 1: subject = "Slide-seq workflow failed for " + flowcell_barcode content = "The Slide-seq workflow for " + library + " " + locus_function_list + " failed at the step of generating BijectiveMapping.mat. Please check the log file for the issues. " call_args = [ 'python', '{}/send_email.py'.format(scripts_folder), email_address, subject, content ] call(call_args) sys.exit()
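# The scripts in this section repeat the same key=value manifest parsing in every
# main() and rely on a str2bool helper defined elsewhere in the pipeline. The two
# sketches below are minimal stand-ins under those assumptions; the name
# read_manifest and the exact truth-value spellings accepted by str2bool are
# illustrative, not the pipeline's own definitions.
def read_manifest(manifest_file):
    """Parse a manifest of key=value lines into a dict (sketch)."""
    options = {}
    with open(manifest_file, "r") as fp:
        for line in fp:
            line = line.rstrip()
            if not line or "=" not in line:
                continue  # skip blank or malformed lines
            key, value = line.split("=", 1)
            options[key] = value
    return options


def str2bool(value):
    """Interpret common textual truth values as a bool (sketch)."""
    return str(value).strip().lower() in ('true', 't', 'yes', 'y', '1')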
def main(): if len(sys.argv) != 4: print( "Please provide three arguments: manifest file, library ID and locus function list!" ) sys.exit() manifest_file = sys.argv[1] library = sys.argv[2] locus_function_list = sys.argv[3] # Check if the manifest file exists if not os.path.isfile(manifest_file): print("File {} does not exist. Exiting...".format(manifest_file)) sys.exit() # Read manifest file options = {} with open(manifest_file, "r") as fp: for line in fp: dict = line.rstrip().split("=") options[dict[0]] = dict[1] fp.close() flowcell_directory = options['flowcell_directory'] output_folder = options['output_folder'] metadata_file = options['metadata_file'] flowcell_barcode = options['flowcell_barcode'] library_folder = options[ 'library_folder'] if 'library_folder' in options else '{}/libraries'.format( output_folder) tmpdir = options[ 'temp_folder'] if 'temp_folder' in options else '{}/tmp'.format( output_folder) dropseq_folder = options[ 'dropseq_folder'] if 'dropseq_folder' in options else '/broad/macosko/bin/dropseq-tools' picard_folder = options[ 'picard_folder'] if 'picard_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/picard' STAR_folder = options[ 'STAR_folder'] if 'STAR_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/STAR-2.5.2a' scripts_folder = options[ 'scripts_folder'] if 'scripts_folder' in options else '/broad/macosko/jilong/slideseq_pipeline/scripts' is_NovaSeq = str2bool( options['is_NovaSeq']) if 'is_NovaSeq' in options else False is_NovaSeq_S4 = str2bool( options['is_NovaSeq_S4']) if 'is_NovaSeq_S4' in options else False num_slice_NovaSeq = int( options['num_slice_NovaSeq']) if 'num_slice_NovaSeq' in options else 10 num_slice_NovaSeq_S4 = int(options['num_slice_NovaSeq_S4'] ) if 'num_slice_NovaSeq_S4' in options else 40 basecalls_dir = '{}/Data/Intensities/BaseCalls'.format(flowcell_directory) runinfo_file = '{}/RunInfo.xml'.format(flowcell_directory) log_file = '{}/logs/workflow.log'.format(output_folder) # Read info from metadata file lanes = [] lanes_unique = [] libraries = [] libraries_unique = [] barcodes = [] bead_structures = [] reference = '' run_barcodematching = False puckcaller_path = '' bead_type = '180402' sequence = 'AAGCAGTGGTATCAACGCAGAGTGAATGGG' base_quality = '10' min_transcripts_per_cell = '10' email_address = '' experiment_date = '' gen_downsampling = False with open('{}/parsed_metadata.txt'.format(output_folder), 'r') as fin: reader = csv.reader(fin, delimiter='\t') rows = list(reader) row0 = rows[0] for i in range(1, len(rows)): row = rows[i] lanes.append(row[row0.index('lane')]) if row[row0.index('lane')] not in lanes_unique: lanes_unique.append(row[row0.index('lane')]) libraries.append(row[row0.index('library')]) if row[row0.index('library')] not in libraries_unique: libraries_unique.append(row[row0.index('library')]) barcodes.append(row[row0.index('sample_barcode')]) bead_structures.append(row[row0.index('bead_structure')]) if row[row0.index('library')] == library: reference = row[row0.index('reference')] sequence = row[row0.index('start_sequence')] base_quality = row[row0.index('base_quality')] min_transcripts_per_cell = row[row0.index( 'min_transcripts_per_cell')] email_address = row[row0.index('email')] run_barcodematching = str2bool( row[row0.index('run_barcodematching')]) puckcaller_path = row[row0.index('puckcaller_path')] bead_type = row[row0.index('bead_type')] experiment_date = row[row0.index('date')] if 'gen_downsampling' in row0: gen_downsampling = str2bool( row[row0.index('gen_downsampling')]) 
fin.close() reference_folder = reference[:reference.rfind('/')] referencePure = reference[reference.rfind('/') + 1:] if (referencePure.endswith('.gz')): referencePure = referencePure[:referencePure.rfind('.')] referencePure = referencePure[:referencePure.rfind('.')] genome_dir = '{}/STAR'.format(reference_folder) intervals = '{}/{}.genes.intervals'.format(reference_folder, referencePure) annotations_file = '{}/{}.gtf'.format(reference_folder, referencePure) ref_flat = '{}/{}.refFlat'.format(reference_folder, referencePure) ribosomal_intervals = '{}/{}.rRNA.intervals'.format( reference_folder, referencePure) reference2 = referencePure + '.' + locus_function_list folder_running = '{}/status/running.analysis_spec_{}_{}'.format( output_folder, library, locus_function_list) folder_finished = '{}/status/finished.analysis_spec_{}_{}'.format( output_folder, library, locus_function_list) folder_failed = '{}/status/failed.analysis_spec_{}_{}'.format( output_folder, library, locus_function_list) analysis_folder = '{}/{}_{}'.format(library_folder, experiment_date, library) alignment_folder = '{}/{}/alignment/'.format(analysis_folder, reference2) barcode_matching_folder = '{}/{}/barcode_matching/'.format( analysis_folder, reference2) combined_bamfile = '{}/{}.bam'.format(analysis_folder, library) call(['mkdir', '-p', folder_running]) try: now = datetime.now() dt_string = now.strftime("%Y-%m-%d %H:%M:%S") print(dt_string) # Select cells by num transcripts commandStr = dropseq_folder + '/SelectCellsByNumTranscripts ' if is_NovaSeq or is_NovaSeq_S4: commandStr += '-m 24076m I=' + combined_bamfile + ' MIN_TRANSCRIPTS_PER_CELL=' + min_transcripts_per_cell + ' READ_MQ=' + base_quality else: commandStr += '-m 7692m I=' + combined_bamfile + ' MIN_TRANSCRIPTS_PER_CELL=' + min_transcripts_per_cell + ' READ_MQ=' + base_quality commandStr += ' OUTPUT=' + alignment_folder + library + '.' + min_transcripts_per_cell + '_transcripts_mq_' + base_quality + '_selected_cells.txt.gz ' commandStr += 'TMP_DIR=' + tmpdir + ' VALIDATION_STRINGENCY=SILENT' if locus_function_list == 'exonic+intronic': commandStr += ' LOCUS_FUNCTION_LIST=INTRONIC' elif locus_function_list == 'intronic': commandStr += ' LOCUS_FUNCTION_LIST=null LOCUS_FUNCTION_LIST=INTRONIC' write_log( log_file, flowcell_barcode, "SelectCellsByNumTranscripts for " + library + " Command=" + commandStr) os.system(commandStr) write_log(log_file, flowcell_barcode, "SelectCellsByNumTranscripts for " + library + " is done. ") # Call run_cmatcher if run_barcodematching: finish_file = '{}/BeadBarcodes_degenerate.finished'.format( analysis_folder) while 1: if os.path.isfile(finish_file): call(['rm', finish_file]) break time.sleep(30) bead_barcode_file = '{}/BeadBarcodes_degenerate.txt'.format( analysis_folder) select_cell_gzfile = alignment_folder + library + '.' + min_transcripts_per_cell + '_transcripts_mq_' + base_quality + '_selected_cells.txt.gz' select_cell_file = alignment_folder + library + '.' + min_transcripts_per_cell + '_transcripts_mq_' + base_quality + '_selected_cells.txt' name = library + '.' + min_transcripts_per_cell + '_transcripts_mq_' + base_quality + '_selected_cells' name_shuffled = library + '.' + min_transcripts_per_cell + '_transcripts_mq_' + base_quality + '_selected_cells.shuffled' os.system('gunzip -c ' + select_cell_gzfile + ' > ' + select_cell_file) select_cell_shuffled_file = alignment_folder + library + '.' 
+ min_transcripts_per_cell + '_transcripts_mq_' + base_quality + '_selected_cells.shuffled.txt' with open(select_cell_shuffled_file, 'w') as fout: with open(select_cell_file, 'r') as fin: for line in fin: line = line.strip(' \t\n') items = list(line) random.shuffle(items) bc = ''.join(items) fout.write(bc + '\n') fin.close() fout.close() l = 0 with open(select_cell_file, 'r') as fin: for line in fin: l += 1 fin.close() k = 10000 ls = l // k for i in range(ls + 1): if i * k >= l: break # real barcodes infile2 = '{}/{}_{}.txt'.format(alignment_folder, name, str(i + 1)) commandStr = 'awk \'NR >= {} && NR <= {}\' {} > {}'.format( str(i * k + 1), str((i + 1) * k), select_cell_file, infile2) os.system(commandStr) file4 = '{}/{}_barcode_matching_distance_{}.txt'.format( barcode_matching_folder, library, str(i + 1)) file5 = '{}/{}_barcode_matching_{}.txt'.format( barcode_matching_folder, library, str(i + 1)) output_file = '{}/logs/run_cmatcher_{}_{}_{}.log'.format( output_folder, library, locus_function_list, str(i + 1)) submission_script = '{}/run_cmatcher.sh'.format(scripts_folder) call_args = [ 'qsub', '-o', output_file, '-l', 'h_vmem=30G', '-notify', '-l', 'h_rt=26:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script, scripts_folder, bead_barcode_file, infile2, file4, file5, bead_type, output_folder, barcode_matching_folder ] call_to_taskrunner(output_folder, call_args) write_log( log_file, flowcell_barcode, "Run CMatcher for " + library + " " + reference2 + " " + str(i + 1)) # shuffled barcodes infile2 = '{}/{}_{}.txt'.format(alignment_folder, name_shuffled, str(i + 1)) commandStr = 'awk \'NR >= {} && NR <= {}\' {} > {}'.format( str(i * k + 1), str((i + 1) * k), select_cell_shuffled_file, infile2) os.system(commandStr) file4 = '{}/{}_barcode_matching_distance_shuffled_{}.txt'.format( barcode_matching_folder, library, str(i + 1)) file5 = '{}/{}_barcode_matching_shuffled_{}.txt'.format( barcode_matching_folder, library, str(i + 1)) output_file = '{}/logs/run_cmatcher_{}_{}_shuffled_{}.log'.format( output_folder, library, locus_function_list, str(i + 1)) submission_script = '{}/run_cmatcher.sh'.format(scripts_folder) call_args = [ 'qsub', '-o', output_file, '-l', 'h_vmem=30G', '-notify', '-l', 'h_rt=26:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script, scripts_folder, bead_barcode_file, infile2, file4, file5, bead_type, output_folder, barcode_matching_folder ] call_to_taskrunner(output_folder, call_args) write_log( log_file, flowcell_barcode, "Run CMatcher for " + library + " " + reference2 + " " + str(i + 1)) # Call run_cmatcher_combine output_file = '{}/logs/run_cmatcher_combine_{}_{}.log'.format( output_folder, library, locus_function_list) submission_script = '{}/run_cmatcher_combine.sh'.format( scripts_folder) call_args = [ 'qsub', '-o', output_file, '-l', 'h_vmem=10G', '-notify', '-l', 'h_rt=48:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script, manifest_file, library, scripts_folder, locus_function_list, output_folder, '{}/{}'.format(analysis_folder, reference2) ] call_to_taskrunner(output_folder, call_args) # Generate digital expression files for all Illumina barcodes commandStr = dropseq_folder + '/DigitalExpression ' if is_NovaSeq or is_NovaSeq_S4: commandStr += '-m 32268m ' else: commandStr += '-m 7692m ' commandStr += 'I=' + combined_bamfile + ' O=' + alignment_folder + library + '.AllIllumina.digital_expression.txt.gz ' commandStr += 'SUMMARY=' + alignment_folder + library + 
'.AllIllumina.digital_expression_summary.txt EDIT_DISTANCE=1 READ_MQ=' + base_quality + ' MIN_BC_READ_THRESHOLD=0 ' commandStr += 'CELL_BC_FILE=' + alignment_folder + library + '.' + min_transcripts_per_cell + '_transcripts_mq_' + base_quality + '_selected_cells.txt.gz TMP_DIR=' + tmpdir + ' ' commandStr += 'OUTPUT_HEADER=false UEI=' + library + ' VALIDATION_STRINGENCY=SILENT' if locus_function_list == 'exonic+intronic': commandStr += ' LOCUS_FUNCTION_LIST=INTRONIC' elif locus_function_list == 'intronic': commandStr += ' LOCUS_FUNCTION_LIST=null LOCUS_FUNCTION_LIST=INTRONIC' write_log( log_file, flowcell_barcode, "DigitalExpression for " + library + " for all Illumina barcodes Command=" + commandStr) os.system(commandStr) write_log( log_file, flowcell_barcode, "DigitalExpression for " + library + " for all Illumina barcodes is done. ") if gen_downsampling: # Downsample bam downsample_folder = '{}/{}_{}/{}/downsample/'.format( library_folder, experiment_date, library, reference2) call(['mkdir', '-p', downsample_folder]) f1 = '{}/{}.AllIllumina.digital_expression_summary.txt'.format( alignment_folder, library) f2 = '{}/{}_1.digital_expression_summary.txt'.format( downsample_folder, library) call(['cp', f1, f2]) ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] for i in range(0, 9, 1): output_file = '{}/logs/gen_downsample_dge_{}_{}_{}.log'.format( output_folder, library, reference2, str(ratio[i])) submission_script = '{}/gen_downsample_dge.sh'.format( scripts_folder) call_args = [ 'qsub', '-o', output_file, '-l', 'h_vmem=47G', '-notify', '-l', 'h_rt=14:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script, manifest_file, library, scripts_folder, locus_function_list, str(ratio[i]), output_folder, downsample_folder ] call_to_taskrunner(output_folder, call_args) # Call generate_plot_downsampling output_file = '{}/logs/generate_plot_downsampling_{}_{}.log'.format( output_folder, library, reference2) submission_script = '{}/generate_plot_downsampling.sh'.format( scripts_folder) call_args = [ 'qsub', '-o', output_file, '-l', 'h_vmem=10G', '-notify', '-l', 'h_rt=40:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script, manifest_file, library, scripts_folder, locus_function_list, output_folder, barcode_matching_folder ] call_to_taskrunner(output_folder, call_args) if not run_barcodematching: if os.path.isdir(barcode_matching_folder): call(['rm', '-r', barcode_matching_folder]) if len(email_address) > 1: subject = "Slide-seq workflow finished for " + flowcell_barcode content = "The Slide-seq workflow for " + library + "_" + locus_function_list + " is finished. Please check the output folder for the results. Thank you for using the Slide-seq tools! 
" call_args = [ 'python', '{}/send_email.py'.format(scripts_folder), email_address, subject, content ] call(call_args) output_file = '{}/logs/give_group_{}_{}.log'.format( output_folder, library, reference2) submission_script = '{}/give_all_group_write.sh'.format( scripts_folder) call_args = [ 'qsub', '-o', output_file, '-l', 'h_vmem=5G', '-notify', '-l', 'h_rt=10:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script ] call_to_taskrunner(output_folder, call_args) now = datetime.now() dt_string = now.strftime("%Y-%m-%d %H:%M:%S") print(dt_string) call(['mv', folder_running, folder_finished]) except Exception as exp: print("EXCEPTION:!") print(exp) traceback.print_tb(exp.__traceback__, file=sys.stdout) if os.path.isdir(folder_running): call(['mv', folder_running, folder_failed]) elif os.path.isdir(folder_waiting): call(['mv', folder_waiting, folder_failed]) else: call(['mkdir', '-p', folder_failed]) if len(email_address) > 1: subject = "Slide-seq workflow failed for " + flowcell_barcode content = "The Slide-seq workflow for " + library + " " + locus_function_list + " failed at the step of running specific analysis. Please check the log file for the issues. " call_args = [ 'python', '{}/send_email.py'.format(scripts_folder), email_address, subject, content ] call(call_args) sys.exit()
def main(): if len(sys.argv) != 3: print("Please provide two arguments: manifest file and library ID!") sys.exit() manifest_file = sys.argv[1] library = sys.argv[2] # Check if the manifest file exists if not os.path.isfile(manifest_file): print("File {} does not exist. Exiting...".format(manifest_file)) sys.exit() # Read manifest file options = {} with open(manifest_file,"r") as fp: for line in fp: dict = line.rstrip().split("=") options[dict[0]] = dict[1] fp.close() flowcell_directory = options['flowcell_directory'] output_folder = options['output_folder'] metadata_file = options['metadata_file'] flowcell_barcode = options['flowcell_barcode'] library_folder = options['library_folder'] if 'library_folder' in options else '{}/libraries'.format(output_folder) tmpdir = options['temp_folder'] if 'temp_folder' in options else '{}/tmp'.format(output_folder) dropseq_folder = options['dropseq_folder'] if 'dropseq_folder' in options else '/broad/macosko/bin/dropseq-tools' picard_folder = options['picard_folder'] if 'picard_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/picard' STAR_folder = options['STAR_folder'] if 'STAR_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/STAR-2.5.2a' scripts_folder = options['scripts_folder'] if 'scripts_folder' in options else '/broad/macosko/jilong/slideseq_pipeline/scripts' is_NovaSeq = str2bool(options['is_NovaSeq']) if 'is_NovaSeq' in options else False is_NovaSeq_S4 = str2bool(options['is_NovaSeq_S4']) if 'is_NovaSeq_S4' in options else False num_slice_NovaSeq = int(options['num_slice_NovaSeq']) if 'num_slice_NovaSeq' in options else 10 num_slice_NovaSeq_S4 = int(options['num_slice_NovaSeq_S4']) if 'num_slice_NovaSeq_S4' in options else 40 basecalls_dir = '{}/Data/Intensities/BaseCalls'.format(flowcell_directory) runinfo_file = '{}/RunInfo.xml'.format(flowcell_directory) log_file = '{}/logs/workflow.log'.format(output_folder) # Read info from metadata file lanes = [] lanes_unique = [] libraries = [] libraries_unique = [] barcodes = [] bead_structures = [] reference = '' locus_function_list = 'exonic+intronic' run_barcodematching = False puckcaller_path = '' bead_type = '180402' email_address = '' experiment_date = '' gen_updistance_plot = False with open('{}/parsed_metadata.txt'.format(output_folder), 'r') as fin: reader = csv.reader(fin, delimiter='\t') rows = list(reader) row0 = rows[0] for i in range(1, len(rows)): row = rows[i] lanes.append(row[row0.index('lane')]) if row[row0.index('lane')] not in lanes_unique: lanes_unique.append(row[row0.index('lane')]) libraries.append(row[row0.index('library')]) if row[row0.index('library')] not in libraries_unique: libraries_unique.append(row[row0.index('library')]) barcodes.append(row[row0.index('sample_barcode')]) bead_structures.append(row[row0.index('bead_structure')]) if row[row0.index('library')] == library: reference = row[row0.index('reference')] locus_function_list = row[row0.index('locus_function_list')] email_address = row[row0.index('email')] run_barcodematching = str2bool(row[row0.index('run_barcodematching')]) puckcaller_path = row[row0.index('puckcaller_path')] bead_type = row[row0.index('bead_type')] experiment_date = row[row0.index('date')] if 'gen_updistance_plot' in row0: gen_updistance_plot = str2bool(row[row0.index('gen_updistance_plot')]) fin.close() # Get tile information from RunInfo.xml slice_id = {} slice_first_tile = {} slice_tile_limit = {} for lane in lanes_unique: tile_nums = get_tiles(runinfo_file, lane) tile_cou = len(tile_nums) if ((not 
is_NovaSeq) and (not is_NovaSeq_S4)): slice_id[lane] = ['0'] slice_first_tile[lane] = [str(tile_nums[0])] slice_tile_limit[lane] = [str(tile_cou)] else: slice_cou = num_slice_NovaSeq if is_NovaSeq else num_slice_NovaSeq_S4 tile_cou_per_slice = (tile_cou // slice_cou) + 1 slice_id[lane] = [] slice_first_tile[lane] = [] slice_tile_limit[lane] = [] for i in range(slice_cou): if (tile_cou_per_slice * i >= tile_cou): break slice_id[lane].append(str(i)) slice_first_tile[lane].append(str(tile_nums[tile_cou_per_slice * i])) slice_tile_limit[lane].append(str(tile_cou_per_slice)) folder_waiting = '{}/status/waiting.analysis_{}'.format(output_folder, library) folder_running = '{}/status/running.analysis_{}'.format(output_folder, library) folder_finished = '{}/status/finished.analysis_{}'.format(output_folder, library) folder_failed = '{}/status/failed.analysis_{}'.format(output_folder, library) analysis_folder = '{}/{}_{}'.format(library_folder, experiment_date, library) call(['mkdir', '-p', folder_waiting]) if run_barcodematching: file2 = '{}/BeadBarcodes.txt'.format(puckcaller_path) file3 = '{}/BeadLocations.txt'.format(puckcaller_path) while 1: if os.path.isfile(file2) and os.path.isfile(file3): break time.sleep(30) call(['cp', file2, analysis_folder+'/']) call(['cp', file3, analysis_folder+'/']) bead_barcode_file = '{}/BeadBarcodes.txt'.format(analysis_folder) bead_location_file = '{}/BeadLocations.txt'.format(analysis_folder) l = 0 with open(bead_barcode_file, 'r') as fin: for line in fin: l += 1 fin.close() k = 10000 ls = l // k for i in range(ls + 1): if i * k >= l: break; infile2 = '{}/BeadBarcodes_{}.txt'.format(analysis_folder, str(i + 1)) commandStr = 'awk \'NR >= {} && NR <= {}\' {} > {}'.format(str(i * k + 1), str((i+1) * k), bead_barcode_file, infile2) os.system(commandStr) file4 = '{}/{}_barcode_matching_01_{}.txt'.format(analysis_folder, library, str(i + 1)) file5 = '{}/{}_barcode_matching_2_{}.txt'.format(analysis_folder, library, str(i + 1)) output_file = '{}/logs/run_cmatcher_beads_{}.log'.format(output_folder, str(i + 1)) submission_script = '{}/run_cmatcher_beads.sh'.format(scripts_folder) call_args = ['qsub', '-o', output_file, '-l', 'h_vmem=5G', '-notify', '-l', 'h_rt=5:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script, scripts_folder, infile2, bead_barcode_file, bead_location_file, file4, file5, bead_type, output_folder, analysis_folder] call_to_taskrunner(output_folder, call_args) # Call run_cmatcher_beads_combine output_file = '{}/logs/run_cmatcher_beads_combine_{}.log'.format(output_folder, library) submission_script = '{}/run_cmatcher_beads_combine.sh'.format(scripts_folder) call_args = ['qsub', '-o', output_file, '-l', 'h_vmem=5G', '-notify', '-l', 'h_rt=50:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script, manifest_file, library, scripts_folder, output_folder, analysis_folder] call_to_taskrunner(output_folder, call_args) # Wait for all of run_alignment finish failed_list = [] while 1: f = True for i in range(len(lanes)): if libraries[i] != library: continue for slice in slice_id[lanes[i]]: fol1 = '{}/status/finished.alignment_{}_{}_{}_{}'.format(output_folder, library, lanes[i], slice, barcodes[i]) fol2 = '{}/status/failed.alignment_{}_{}_{}_{}'.format(output_folder, library, lanes[i], slice, barcodes[i]) if (not os.path.isdir(fol1)) and (not os.path.isdir(fol2)): f = False prefix_libraries = '{}/{}.{}.{}.{}'.format(analysis_folder, flowcell_barcode, lanes[i], slice, library) if (barcodes[i]): prefix_libraries += 
'.'+barcodes[i] star_bamfile = prefix_libraries + '.star_gene_exon_tagged2.bam' if (os.path.isdir(fol1) or os.path.isdir(fol2)) and (not os.path.isfile(star_bamfile)): if star_bamfile not in failed_list: failed_list.append(star_bamfile) if os.path.isdir(fol1): call(['rm', '-r', fol1]) if os.path.isdir(fol2): call(['rm', '-r', fol2]) if os.path.isfile(prefix_libraries+'.star.Log.final.out'): call(['rm', prefix_libraries+'.star.Log.final.out']) if os.path.isfile(prefix_libraries+'.star.Log.out'): call(['rm', prefix_libraries+'.star.Log.out']) if os.path.isfile(prefix_libraries+'.star.Log.progress.out'): call(['rm', prefix_libraries+'.star.Log.progress.out']) if os.path.isfile(prefix_libraries+'.star.SJ.out.tab'): call(['rm', prefix_libraries+'.star.SJ.out.tab']) if os.path.isdir(prefix_libraries+'.star._STARtmp'): call(['rm', '-r', prefix_libraries+'.star._STARtmp']) output_file = '{}/logs/run_alignment_{}_{}_{}.log'.format(output_folder, library, lanes[i], slice) submission_script = '{}/run_alignment.sh'.format(scripts_folder) call_args = ['qsub', '-o', output_file, '-l', 'h_vmem=60G', '-notify', '-l', 'h_rt=21:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script, manifest_file, library, lanes[i], slice, barcodes[i], scripts_folder, output_folder, analysis_folder] call_to_taskrunner(output_folder, call_args) f = False else: write_log(log_file, flowcell_barcode, 'MergeSamFiles error: '+star_bamfile+' does not exist!') raise Exception(star_bamfile + ' does not exist!') if f: break time.sleep(60) if os.path.isdir(folder_waiting): call(['mv', folder_waiting, folder_running]) else: call(['mkdir', '-p', folder_running]) try: now = datetime.now() dt_string = now.strftime("%Y-%m-%d %H:%M:%S") print(dt_string) # Merge bam files combined_bamfile = '{}/{}.bam'.format(analysis_folder, library) commandStr = 'java -Djava.io.tmpdir='+tmpdir+' -Dsamjdk.buffer_size=131072 -XX:+UseParallelOldGC -XX:ParallelGCThreads=1 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8192m ' commandStr += '-jar '+picard_folder+'/picard.jar MergeSamFiles TMP_DIR='+tmpdir+' CREATE_INDEX=true CREATE_MD5_FILE=false VALIDATION_STRINGENCY=SILENT ' commandStr += 'OUTPUT='+combined_bamfile+' SORT_ORDER=coordinate ASSUME_SORTED=true' for i in range(len(lanes)): if libraries[i] != library: continue for slice in slice_id[lanes[i]]: star_bamfile = '{}/{}.{}.{}.{}'.format(analysis_folder, flowcell_barcode, lanes[i], slice, library) if (barcodes[i]): star_bamfile += '.'+barcodes[i] star_bamfile += '.star_gene_exon_tagged2.bam' if not os.path.isfile(star_bamfile): write_log(log_file, flowcell_barcode, 'MergeSamFiles error: '+star_bamfile+' does not exist!') raise Exception(star_bamfile + ' does not exist!') commandStr += ' INPUT='+star_bamfile write_log(log_file, flowcell_barcode, "MergeSamFiles for "+library+" Command="+commandStr) os.system(commandStr) write_log(log_file, flowcell_barcode, "MergeSamFiles for "+library+" is done. 
") # Validate bam file commandStr = 'java -Djava.io.tmpdir='+tmpdir+' -Dsamjdk.buffer_size=131072 -XX:+UseParallelOldGC -XX:ParallelGCThreads=1 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx16384m ' commandStr += '-jar '+picard_folder+'/picard.jar ValidateSamFile TMP_DIR='+tmpdir+' VALIDATION_STRINGENCY=SILENT ' commandStr += 'INPUT='+combined_bamfile+' MODE=SUMMARY' if (not is_NovaSeq) and (not is_NovaSeq_S4): commandStr += ' IGNORE=MISSING_PLATFORM_VALUE IGNORE=INVALID_VERSION_NUMBER' write_log(log_file, flowcell_barcode, "ValidateSamFile for "+library+" Command="+commandStr) os.system(commandStr) write_log(log_file, flowcell_barcode, "ValidateSamFile for "+library+" is done. ") # Call generate_plots output_file = '{}/logs/generate_plots_{}.log'.format(output_folder, library) submission_script = '{}/generate_plots.sh'.format(scripts_folder) call_args = ['qsub', '-o', output_file, '-l', 'h_vmem=55G', '-notify', '-l', 'h_rt=10:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script, manifest_file, library, scripts_folder, output_folder, analysis_folder] call_to_taskrunner(output_folder, call_args) lists = locus_function_list.split(',') referencePure = reference[reference.rfind('/') + 1:] if (referencePure.endswith('.gz')): referencePure = referencePure[:referencePure.rfind('.')] referencePure = referencePure[:referencePure.rfind('.')] for l in lists: call(['mkdir', '-p', '{}/{}.{}'.format(analysis_folder, referencePure, l)]) call(['mkdir', '-p', '{}/{}.{}/alignment'.format(analysis_folder, referencePure, l)]) if run_barcodematching: barcode_matching_folder = '{}/{}.{}/barcode_matching/'.format(analysis_folder, referencePure, l) call(['mkdir', '-p', barcode_matching_folder]) for i in range(len(lanes)): if libraries[i] != library: continue for slice in slice_id[lanes[i]]: toCopyFile = '{}/{}.{}.{}.{}'.format(analysis_folder, flowcell_barcode, lanes[i], slice, library) if (barcodes[i]): toCopyFile += '.'+barcodes[i] toCopyFile += '.star_gene_exon_tagged2.bam' if os.path.isfile(toCopyFile): call(['cp', toCopyFile, barcode_matching_folder]) # Call run_analysis_spec output_file = '{}/logs/run_analysis_spec_{}_{}.log'.format(output_folder, library, l) submission_script = '{}/run_analysis_spec.sh'.format(scripts_folder) call_args = ['qsub', '-o', output_file, '-l', 'h_vmem=60G', '-notify', '-l', 'h_rt=24:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script, manifest_file, library, scripts_folder, l, output_folder, '{}/{}.{}'.format(analysis_folder, referencePure, l)] call_to_taskrunner(output_folder, call_args) for i in range(len(lanes)): if libraries[i] != library: continue for slice in slice_id[lanes[i]]: toDeleteFile = '{}/{}.{}.{}.{}'.format(analysis_folder, flowcell_barcode, lanes[i], slice, library) if (barcodes[i]): toDeleteFile += '.'+barcodes[i] toDeleteFile += '.star_gene_exon_tagged2.bam' if os.path.isfile(toDeleteFile): call(['rm', toDeleteFile]) # Combine check_alignments_quality files dict_unique_score = {} dict_multi_score = {} dict_unique_mismatch = {} dict_multi_mismatch = {} dict_unique_ratio = {} dict_multi_ratio = {} for i in range(len(lanes)): if libraries[i] != library: continue for slice in slice_id[lanes[i]]: star_samfile = '{}/{}.{}.{}.{}'.format(analysis_folder, flowcell_barcode, lanes[i], slice, library) if (barcodes[i]): star_samfile += '.'+barcodes[i] star_samfile += '.star.Aligned.out.sam' file1 = star_samfile + ".unique.score"; file2 = star_samfile + ".multi.score"; file3 = star_samfile + ".unique.mismatch"; file4 = 
star_samfile + ".multi.mismatch"; file5 = star_samfile + ".unique.ratio"; file6 = star_samfile + ".multi.ratio"; if os.path.isfile(file1): with open(file1, 'r') as fin: for line in fin: c1 = line.split('\t')[0].strip(' \t\n') c2 = int(line.split('\t')[1].strip(' \t\n')) if not c1 in dict_unique_score: dict_unique_score[c1] = c2 else: dict_unique_score[c1] += c2 fin.close() if os.path.isfile(file2): with open(file2, 'r') as fin: for line in fin: c1 = line.split('\t')[0].strip(' \t\n') c2 = int(line.split('\t')[1].strip(' \t\n')) if not c1 in dict_multi_score: dict_multi_score[c1] = c2 else: dict_multi_score[c1] += c2 fin.close() if os.path.isfile(file3): with open(file3, 'r') as fin: for line in fin: c1 = line.split('\t')[0].strip(' \t\n') c2 = int(line.split('\t')[1].strip(' \t\n')) if not c1 in dict_unique_mismatch: dict_unique_mismatch[c1] = c2 else: dict_unique_mismatch[c1] += c2 fin.close() if os.path.isfile(file4): with open(file4, 'r') as fin: for line in fin: c1 = line.split('\t')[0].strip(' \t\n') c2 = int(line.split('\t')[1].strip(' \t\n')) if not c1 in dict_multi_mismatch: dict_multi_mismatch[c1] = c2 else: dict_multi_mismatch[c1] += c2 fin.close() if os.path.isfile(file5): with open(file5, 'r') as fin: for line in fin: c1 = line.split('\t')[0].strip(' \t\n') c2 = int(line.split('\t')[1].strip(' \t\n')) if not c1 in dict_unique_ratio: dict_unique_ratio[c1] = c2 else: dict_unique_ratio[c1] += c2 fin.close() if os.path.isfile(file6): with open(file6, 'r') as fin: for line in fin: c1 = line.split('\t')[0].strip(' \t\n') c2 = int(line.split('\t')[1].strip(' \t\n')) if not c1 in dict_multi_ratio: dict_multi_ratio[c1] = c2 else: dict_multi_ratio[c1] += c2 fin.close() call(['rm', file1]) call(['rm', file2]) call(['rm', file3]) call(['rm', file4]) call(['rm', file5]) call(['rm', file6]) outfile1 = '{}/{}.unique.score'.format(analysis_folder, library) outfile2 = '{}/{}.multi.score'.format(analysis_folder, library) outfile3 = '{}/{}.unique.mismatch'.format(analysis_folder, library) outfile4 = '{}/{}.multi.mismatch'.format(analysis_folder, library) outfile5 = '{}/{}.unique.ratio'.format(analysis_folder, library) outfile6 = '{}/{}.multi.ratio'.format(analysis_folder, library) with open(outfile1, 'w') as fout: for k in dict_unique_score: fout.write(k + '\t' + str(dict_unique_score[k]) + '\n') fout.close() with open(outfile2, 'w') as fout: for k in dict_multi_score: fout.write(k + '\t' + str(dict_multi_score[k]) + '\n') fout.close() with open(outfile3, 'w') as fout: for k in dict_unique_mismatch: fout.write(k + '\t' + str(dict_unique_mismatch[k]) + '\n') fout.close() with open(outfile4, 'w') as fout: for k in dict_multi_mismatch: fout.write(k + '\t' + str(dict_multi_mismatch[k]) + '\n') fout.close() with open(outfile5, 'w') as fout: for k in dict_unique_ratio: fout.write(k + '\t' + str(dict_unique_ratio[k]) + '\n') fout.close() with open(outfile6, 'w') as fout: for k in dict_multi_ratio: fout.write(k + '\t' + str(dict_multi_ratio[k]) + '\n') fout.close() # plot commandStr = 'python {}/plot_alignment_histogram.py {} {} {}'.format(scripts_folder, analysis_folder, library, library) os.system(commandStr) # Summary mapping rate totalreads = 0 uniquereads = 0 multireads = 0 toomanyreads = 0 for i in range(len(lanes)): if libraries[i] != library: continue for slice in slice_id[lanes[i]]: log_file = '{}/{}/{}/{}/{}/{}.{}.{}.{}.{}.star.Log.final.out'.format(output_folder, lanes[i], slice, library, barcodes[i], flowcell_barcode, lanes[i], slice, library, barcodes[i]) if not os.path.isfile(log_file): 
continue with open(log_file, "r") as f3: for line3 in f3: if get_key(line3) == 'Number of input reads': totalreads += int(get_val(line3)) if get_key(line3) == 'Uniquely mapped reads number': uniquereads += int(get_val(line3)) if get_key(line3) == 'Number of reads mapped to multiple loci': multireads += int(get_val(line3)) if get_key(line3) == 'Number of reads mapped to too many loci': toomanyreads += int(get_val(line3)) f3.close() mismatch1 = 0 mismatch2 = 0 mismatch3 = 0 if '1' in dict_unique_mismatch: mismatch1 += dict_unique_mismatch['1'] if '1' in dict_multi_mismatch: mismatch1 += dict_multi_mismatch['1'] if '2' in dict_unique_mismatch: mismatch2 += dict_unique_mismatch['2'] if '2' in dict_multi_mismatch: mismatch2 += dict_multi_mismatch['2'] if '3' in dict_unique_mismatch: mismatch3 += dict_unique_mismatch['3'] if '3' in dict_multi_mismatch: mismatch3 += dict_multi_mismatch['3'] output_file = '{}/{}_mapping_rate.txt'.format(analysis_folder, library) fout = open(output_file, 'w') fout.write('library\t{}\n'.format(library)) fout.write('total_reads\t{}\n'.format(totalreads)) fout.write('unique_aligned_reads\t{}\n'.format(uniquereads)) fout.write('unique_aligned_ratio\t{}\n'.format('{0:.3g}'.format(100*uniquereads/totalreads))) fout.write('multi_aligned_reads\t{}\n'.format(multireads)) fout.write('multi_aligned_ratio\t{}\n'.format('{0:.3g}'.format(100*multireads/totalreads))) fout.write('too_many_aligned_reads\t{}\n'.format(toomanyreads)) fout.write('too_many_aligned_ratio\t{}\n'.format('{0:.3g}'.format(100*toomanyreads/totalreads))) fout.write('mismatch1_rate\t{}\n'.format('{0:.3g}'.format(100*mismatch1/totalreads))) fout.write('mismatch2_rate\t{}\n'.format('{0:.3g}'.format(100*mismatch2/totalreads))) fout.write('mismatch3_rate\t{}\n'.format('{0:.3g}'.format(100*mismatch3/totalreads))) fout.close() if gen_updistance_plot: for i in range(len(lanes)): if (libraries[i] != library): continue read1_file = '{}/{}.{}.read1.fastq'.format(analysis_folder, library, lanes[i]) read2_file = '{}/{}.{}.read2.fastq'.format(analysis_folder, library, lanes[i]) combined_bamfile = '{}/{}.{}.unmapped.bam'.format(analysis_folder, library, lanes[i]) combined_baifile = '{}/{}.{}.unmapped.bai'.format(analysis_folder, library, lanes[i]) commandStr = 'java -Djava.io.tmpdir='+tmpdir+' -Dsamjdk.buffer_size=131072 -XX:+UseParallelOldGC -XX:ParallelGCThreads=1 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8192m ' commandStr += '-jar '+picard_folder+'/picard.jar MergeSamFiles TMP_DIR='+tmpdir+' CREATE_INDEX=true CREATE_MD5_FILE=false VALIDATION_STRINGENCY=SILENT ' commandStr += 'OUTPUT='+combined_bamfile+' SORT_ORDER=coordinate ASSUME_SORTED=true' for slice in slice_id[lanes[i]]: bamfile = '{}/{}.{}.{}.{}'.format(analysis_folder, flowcell_barcode, lanes[i], slice, library) if (barcodes[i]): bamfile += '.'+barcodes[i] bamfile += '.unmapped.bam' if not os.path.isfile(bamfile): write_log(log_file, flowcell_barcode, 'MergeSamFiles error: '+bamfile+' does not exist!') raise Exception(bamfile + ' does not exist!') commandStr += ' INPUT='+bamfile os.system(commandStr) # Convert bam to fastq commandStr = 'java -Djava.io.tmpdir='+tmpdir+' -Xmx500m -XX:+UseParallelOldGC -XX:ParallelGCThreads=1 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 ' commandStr += '-jar '+picard_folder+'/picard.jar SamToFastq I='+combined_bamfile+' F='+read1_file+' F2='+read2_file+' VALIDATION_STRINGENCY=SILENT' os.system(commandStr) if os.path.isfile(combined_bamfile): call(['rm', combined_bamfile]) if os.path.isfile(combined_baifile): call(['rm', 
combined_baifile]) if os.path.isfile(read2_file): call(['rm', read2_file]) output_file = '{}/logs/run_analysis_UPdistance_{}_{}.log'.format(output_folder, library, lanes[i]) submission_script = '{}/run_analysis_UPdistance.sh'.format(scripts_folder) call_args = ['qsub', '-o', output_file, '-l', 'h_vmem=35G', '-notify', '-l', 'h_rt=10:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7', submission_script, manifest_file, library, lanes[i], scripts_folder, output_folder, analysis_folder] call_to_taskrunner(output_folder, call_args) break now = datetime.now() dt_string = now.strftime("%Y-%m-%d %H:%M:%S") print(dt_string) call(['mv', folder_running, folder_finished]) except Exception as exp: print("EXCEPTION:!") print(exp) traceback.print_tb(exp.__traceback__, file=sys.stdout) if os.path.isdir(folder_running): call(['mv', folder_running, folder_failed]) elif os.path.isdir(folder_waiting): call(['mv', folder_waiting, folder_failed]) else: call(['mkdir', '-p', folder_failed]) if len(email_address) > 1: subject = "Slide-seq workflow failed for " + flowcell_barcode content = "The Slide-seq workflow for "+library+" failed at the step of running analysis. Please check the log file for the issues. " call_args = ['python', '{}/send_email.py'.format(scripts_folder), email_address, subject, content] call(call_args) sys.exit()
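# The mapping-rate summary above reads STAR's Log.final.out through get_key/get_val
# helpers defined elsewhere. Assuming those helpers split each "Field |<tab>Value"
# line on the '|' separator, a minimal equivalent is sketched below; parse_star_log
# is not part of the pipeline.
def parse_star_log(path):
    """Return a dict mapping STAR Log.final.out field names to their values (sketch)."""
    stats = {}
    with open(path, 'r') as fh:
        for line in fh:
            if '|' not in line:
                continue
            key, value = line.split('|', 1)
            stats[key.strip()] = value.strip()
    return stats

# Example: int(parse_star_log(log_file)['Number of input reads']) corresponds to
# the totalreads accumulation above.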
    # Call run pipeline
    if args.resubmit:
        output_file = '{}/logs/run_mergebarcodes.log'.format(output_dir)
        call_args = [
            'qsub', '-o', output_file, '-l', 'h_vmem=10G', '-notify', '-l', 'h_rt=90:0:0',
            '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7',
            submission_script, manifest_file, scripts_folder, output_dir
        ]
    else:
        output_file = '{}/logs/run_pipeline.log'.format(output_dir)
        call_args = [
            'qsub', '-o', output_file, '-l', 'h_vmem=10G', '-notify', '-l', 'h_rt=5:00:0',
            '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7',
            submission_script, manifest_file, scripts_folder, output_dir
        ]

    print('Command issued:')
    print(' '.join(call_args))

    if not args.dryrun:
        call_to_taskrunner(output_dir, call_args)

    submitted.append(flowcell)

print('Flowcells {} submitted for processing'.format(' '.join(submitted)))

skipped = [flowcell for flowcell in flowcells if flowcell not in submitted]
if skipped:
    print('\nFlowcells {} were skipped -- please see warnings above.'.format('_'.join(skipped)))
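# The qsub invocations in these scripts repeat the same scheduler options and differ
# only in log path, memory, runtime, and the wrapped script. A hypothetical helper in
# that style is sketched below; build_qsub_args is not part of the pipeline, and the
# fixed options are simply copied from the calls above.
def build_qsub_args(output_file, vmem, runtime, script, *script_args):
    """Assemble a qsub argument list in the style used by this pipeline (sketch)."""
    return [
        'qsub', '-o', output_file,
        '-l', 'h_vmem={}'.format(vmem),
        '-notify',
        '-l', 'h_rt={}'.format(runtime),
        '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7',
        script,
    ] + list(script_args)

# Example: build_qsub_args(output_file, '10G', '5:00:0', submission_script,
#                          manifest_file, scripts_folder, output_dir)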
def main():
    if len(sys.argv) != 4:
        print("Please provide three arguments: manifest file, library ID and locus function list!")
        sys.exit()

    manifest_file = sys.argv[1]
    library = sys.argv[2]
    locus_function_list = sys.argv[3]

    # Check if the manifest file exists
    if not os.path.isfile(manifest_file):
        print("File {} does not exist. Exiting...".format(manifest_file))
        sys.exit()

    # Read manifest file (one key=value pair per line)
    options = {}
    with open(manifest_file, "r") as fp:
        for line in fp:
            key, value = line.rstrip().split("=", 1)
            options[key] = value

    flowcell_directory = options['flowcell_directory']
    output_folder = options['output_folder']
    metadata_file = options['metadata_file']
    flowcell_barcode = options['flowcell_barcode']

    library_folder = options['library_folder'] if 'library_folder' in options else '{}/libraries'.format(output_folder)
    tmpdir = options['temp_folder'] if 'temp_folder' in options else '{}/tmp'.format(output_folder)
    dropseq_folder = options['dropseq_folder'] if 'dropseq_folder' in options else '/broad/macosko/bin/dropseq-tools'
    picard_folder = options['picard_folder'] if 'picard_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/picard'
    STAR_folder = options['STAR_folder'] if 'STAR_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/STAR-2.5.2a'
    scripts_folder = options['scripts_folder'] if 'scripts_folder' in options else '/broad/macosko/jilong/slideseq_pipeline/scripts'
    is_NovaSeq = str2bool(options['is_NovaSeq']) if 'is_NovaSeq' in options else False
    is_NovaSeq_S4 = str2bool(options['is_NovaSeq_S4']) if 'is_NovaSeq_S4' in options else False
    num_slice_NovaSeq = int(options['num_slice_NovaSeq']) if 'num_slice_NovaSeq' in options else 10
    num_slice_NovaSeq_S4 = int(options['num_slice_NovaSeq_S4']) if 'num_slice_NovaSeq_S4' in options else 40

    # Read info from metadata file
    lanes = []
    lanes_unique = []
    libraries = []
    libraries_unique = []
    barcodes = []
    bead_structures = []
    reference = ''
    base_quality = '10'
    min_transcripts_per_cell = '10'
    email_address = ''
    bead_type = '180402'
    bead_structure = ''
    run_puckmatcher = False
    experiment_date = ''
    gen_read1_plot = False
    with open('{}/parsed_metadata.txt'.format(output_folder), 'r') as fin:
        reader = csv.reader(fin, delimiter='\t')
        rows = list(reader)
        row0 = rows[0]
        for i in range(1, len(rows)):
            row = rows[i]
            lanes.append(row[row0.index('lane')])
            if row[row0.index('lane')] not in lanes_unique:
                lanes_unique.append(row[row0.index('lane')])
            libraries.append(row[row0.index('library')])
            if row[row0.index('library')] not in libraries_unique:
                libraries_unique.append(row[row0.index('library')])
            barcodes.append(row[row0.index('sample_barcode')])
            bead_structures.append(row[row0.index('bead_structure')])
            if row[row0.index('library')] == library:
                reference = row[row0.index('reference')]
                base_quality = row[row0.index('base_quality')]
                min_transcripts_per_cell = row[row0.index('min_transcripts_per_cell')]
                email_address = row[row0.index('email')]
                bead_type = row[row0.index('bead_type')]
                bead_structure = row[row0.index('bead_structure')]
                run_puckmatcher = str2bool(row[row0.index('run_barcodematching')])
                experiment_date = row[row0.index('date')]
                if 'gen_read1_plot' in row0:
                    gen_read1_plot = str2bool(row[row0.index('gen_read1_plot')])

    # Derive reference-related paths
    reference_folder = reference[:reference.rfind('/')]
    referencePure = reference[reference.rfind('/') + 1:]
    if referencePure.endswith('.gz'):
        referencePure = referencePure[:referencePure.rfind('.')]
    referencePure = referencePure[:referencePure.rfind('.')]
    genome_dir = '{}/STAR'.format(reference_folder)
    intervals = '{}/{}.genes.intervals'.format(reference_folder, referencePure)
    annotations_file = '{}/{}.gtf'.format(reference_folder, referencePure)
    ref_flat = '{}/{}.refFlat'.format(reference_folder, referencePure)
    ribosomal_intervals = '{}/{}.rRNA.intervals'.format(reference_folder, referencePure)
    reference2 = referencePure + '.' + locus_function_list

    runinfo_file = '{}/RunInfo.xml'.format(flowcell_directory)
    log_file = '{}/logs/workflow.log'.format(output_folder)

    # Get tile information from RunInfo.xml
    slice_id = {}
    slice_first_tile = {}
    slice_tile_limit = {}
    for lane in lanes_unique:
        tile_nums = get_tiles(runinfo_file, lane)
        tile_cou = len(tile_nums)
        if (not is_NovaSeq) and (not is_NovaSeq_S4):
            slice_id[lane] = ['0']
            slice_first_tile[lane] = [str(tile_nums[0])]
            slice_tile_limit[lane] = [str(tile_cou)]
        else:
            slice_cou = num_slice_NovaSeq if is_NovaSeq else num_slice_NovaSeq_S4
            tile_cou_per_slice = (tile_cou // slice_cou) + 1
            slice_id[lane] = []
            slice_first_tile[lane] = []
            slice_tile_limit[lane] = []
            for i in range(slice_cou):
                if tile_cou_per_slice * i >= tile_cou:
                    break
                slice_id[lane].append(str(i))
                slice_first_tile[lane].append(str(tile_nums[tile_cou_per_slice * i]))
                slice_tile_limit[lane].append(str(tile_cou_per_slice))

    analysis_folder = '{}/{}_{}'.format(library_folder, experiment_date, library)
    alignment_folder = '{}/{}/alignment/'.format(analysis_folder, reference2)
    barcode_matching_folder = '{}/{}/barcode_matching/'.format(analysis_folder, reference2)
    select_cell_file = alignment_folder + library + '.' + min_transcripts_per_cell + '_transcripts_mq_' + base_quality + '_selected_cells.txt'
    bead_barcode_file = '{}/BeadBarcodes_degenerate.txt'.format(analysis_folder)

    if not os.path.isfile(select_cell_file):
        write_log(log_file, flowcell_barcode,
                  'run_cmatcher_combine error: ' + select_cell_file + ' does not exist!')
        raise Exception('run_cmatcher_combine error: ' + select_cell_file + ' does not exist!')

    folder_running = '{}/status/running.cmatcher_combine_{}_{}'.format(output_folder, library, reference2)
    folder_finished = '{}/status/finished.cmatcher_combine_{}_{}'.format(output_folder, library, reference2)
    folder_failed = '{}/status/failed.cmatcher_combine_{}_{}'.format(output_folder, library, reference2)

    try:
        call(['mkdir', '-p', folder_running])

        # Count selected cells; CMatcher processes them in chunks of k barcodes
        l = 0
        with open(select_cell_file, 'r') as fin:
            for line in fin:
                l += 1
        k = 10000
        ls = l // k
        print('# selected cells: ' + str(l))

        # Wait until every CMatcher chunk has written its .finished marker
        while True:
            f = True
            for i in range(ls + 1):
                if i * k >= l:
                    break
                file2 = '{}/{}_barcode_matching_{}.finished'.format(barcode_matching_folder, library, str(i + 1))
                if not os.path.isfile(file2):
                    f = False
                    break
            if f:
                break
            time.sleep(30)

        # Wait for the shuffled-barcode chunks as well
        while True:
            f = True
            for i in range(ls + 1):
                if i * k >= l:
                    break
                file2 = '{}/{}_barcode_matching_shuffled_{}.finished'.format(barcode_matching_folder, library, str(i + 1))
                if not os.path.isfile(file2):
                    f = False
                    break
            if f:
                break
            time.sleep(30)

        print('combine cmatcher outputs...')
        write_log(log_file, flowcell_barcode, "Combine CMatcher outputs for " + library + " " + reference2)
        combined_cmatcher_file = '{}/{}_barcode_matching.txt'.format(barcode_matching_folder, library)
        with open(combined_cmatcher_file, 'w') as fout:
            fout.write('IlluminaBarcodes\tProcessedIlluminaBarcodes\tBeadBarcodes\tDistance\tX\tY\n')
            for i in range(ls + 1):
                if i * k >= l:
                    break
                file2 = '{}/{}_barcode_matching_{}.txt'.format(barcode_matching_folder, library, str(i + 1))
                with open(file2, 'r') as fin:
                    j = 0
                    for line in fin:
                        j += 1
                        if j > 1:
                            fout.write(line)

        # Combine CMatcher logs
        combined_cmatcher_summary = '{}/{}_barcode_matching_summary.txt'.format(barcode_matching_folder, library)
        total = 0
        unique = 0
        multi = 0
        for i in range(ls + 1):
            if i * k >= l:
                break
            file2 = '{}/{}_barcode_matching_{}.txt.log'.format(barcode_matching_folder, library, str(i + 1))
            if not os.path.isfile(file2):
                continue
            j = 0
            with open(file2, 'r') as fin:
                for line in fin:
                    j += 1
                    s = line.split(':')[1]
                    s = s.strip(' \t\n')
                    if j == 1:
                        total += int(s)
                    elif j == 2:
                        unique += int(s)
                    elif j == 3:
                        multi += int(s)
        with open(combined_cmatcher_summary, 'w') as fout:
            fout.write('Total # barcodes: {}\n'.format(str(total)))
            fout.write('# unique matched barcodes: {}, {}%\n'.format(str(unique), str(unique * 100 / total)))
            fout.write('# multiple matched barcodes: {}, {}%\n'.format(str(multi), str(multi * 100 / total)))

        # Remove per-chunk intermediates
        for i in range(ls + 1):
            if i * k >= l:
                break
            file1 = '{}/{}_barcode_matching_{}.txt'.format(barcode_matching_folder, library, str(i + 1))
            file2 = '{}/{}_barcode_matching_{}.finished'.format(barcode_matching_folder, library, str(i + 1))
            name = library + '.' + min_transcripts_per_cell + '_transcripts_mq_' + base_quality + '_selected_cells'
            file3 = '{}/{}_{}.txt'.format(alignment_folder, name, str(i + 1))
            file4 = '{}/{}_barcode_matching_{}.txt.log'.format(barcode_matching_folder, library, str(i + 1))
            if os.path.isfile(file1):
                call(['rm', file1])
            if os.path.isfile(file2):
                call(['rm', file2])
            if os.path.isfile(file3):
                call(['rm', file3])
            if os.path.isfile(file4):
                call(['rm', file4])

        combined_cmatcher_file2 = '{}/{}_barcode_matching_distance.txt'.format(barcode_matching_folder, library)
        with open(combined_cmatcher_file2, 'w') as fout:
            fout.write('IlluminaBarcodes\tProcessedIlluminaBarcodes\tBeadBarcodes\tDistance\tX\tY\n')
            for i in range(ls + 1):
                if i * k >= l:
                    break
                file2 = '{}/{}_barcode_matching_distance_{}.txt'.format(barcode_matching_folder, library, str(i + 1))
                with open(file2, 'r') as fin:
                    j = 0
                    for line in fin:
                        j += 1
                        if j > 1:
                            fout.write(line)
                call(['rm', file2])

        # UniqueMappedIlluminaBarcodes
        bci = np.loadtxt(combined_cmatcher_file, delimiter='\t', dtype='str', skiprows=1, usecols=(1,))
        bci = np.unique(bci)
        unique_bci_file = '{}/{}_unique_matched_illumina_barcodes.txt'.format(barcode_matching_folder, library)
        with open(unique_bci_file, 'w') as f1:
            for bc in bci:
                f1.write("%s\n" % bc)
        os.system('gzip -c ' + unique_bci_file + ' > ' + unique_bci_file + '.gz')

        write_log(log_file, flowcell_barcode,
                  "Combine CMatcher outputs for " + library + " " + reference2 + " is done. ")

        # Get unique matched bead barcodes and locations
        print('Get unique matched bead barcodes and locations...')
        write_log(log_file, flowcell_barcode,
                  "Get unique matched bead barcodes and locations for " + library + " " + reference2)
        matched = {}
        matched_bead_barcode_file = '{}/{}_matched_bead_barcodes.txt'.format(barcode_matching_folder, library)
        matched_bead_location_file = '{}/{}_matched_bead_locations.txt'.format(barcode_matching_folder, library)
        bead_location_forR = '{}/BeadLocationsForR.csv'.format(barcode_matching_folder)
        with open(matched_bead_barcode_file, 'w') as fout1:
            with open(matched_bead_location_file, 'w') as fout2:
                with open(bead_location_forR, 'w') as fout3:
                    fout3.write('barcodes,xcoord,ycoord\n')
                    with open(combined_cmatcher_file, 'r') as fin:
                        j = 0
                        for line in fin:
                            j += 1
                            if j > 1:
                                bc = line.split('\t')[2]
                                dist = line.split('\t')[3]
                                x = line.split('\t')[4]
                                y = line.split('\t')[5]
                                if bc not in matched:
                                    fout1.write(bc + '\n')
                                    fout2.write(dist + '\t' + x + '\t' + y)
                                    fout3.write(bc + ',' + x + ',' + y)
                                    matched[bc] = 1
        write_log(log_file, flowcell_barcode,
                  "Get unique matched bead barcodes and locations for " + library + " " + reference2 + " is done. ")

        # Combine shuffled CMatcher outputs
        combined_cmatcher_file = '{}/{}_barcode_matching_shuffled.txt'.format(barcode_matching_folder, library)
        with open(combined_cmatcher_file, 'w') as fout:
            fout.write('IlluminaBarcodes\tProcessedIlluminaBarcodes\tBeadBarcodes\tDistance\tX\tY\n')
            for i in range(ls + 1):
                if i * k >= l:
                    break
                file2 = '{}/{}_barcode_matching_shuffled_{}.txt'.format(barcode_matching_folder, library, str(i + 1))
                with open(file2, 'r') as fin:
                    j = 0
                    for line in fin:
                        j += 1
                        if j > 1:
                            fout.write(line)

        # Remove shuffled per-chunk intermediates
        for i in range(ls + 1):
            if i * k >= l:
                break
            file1 = '{}/{}_barcode_matching_shuffled_{}.txt'.format(barcode_matching_folder, library, str(i + 1))
            file2 = '{}/{}_barcode_matching_shuffled_{}.finished'.format(barcode_matching_folder, library, str(i + 1))
            name = library + '.' + min_transcripts_per_cell + '_transcripts_mq_' + base_quality + '_selected_cells.shuffled'
            file3 = '{}/{}_{}.txt'.format(alignment_folder, name, str(i + 1))
            file4 = '{}/{}_barcode_matching_shuffled_{}.txt.log'.format(barcode_matching_folder, library, str(i + 1))
            if os.path.isfile(file1):
                call(['rm', file1])
            if os.path.isfile(file2):
                call(['rm', file2])
            if os.path.isfile(file3):
                call(['rm', file3])
            if os.path.isfile(file4):
                call(['rm', file4])

        combined_cmatcher_file2 = '{}/{}_barcode_matching_distance_shuffled.txt'.format(barcode_matching_folder, library)
        with open(combined_cmatcher_file2, 'w') as fout:
            fout.write('IlluminaBarcodes\tProcessedIlluminaBarcodes\tBeadBarcodes\tDistance\tX\tY\n')
            for i in range(ls + 1):
                if i * k >= l:
                    break
                file2 = '{}/{}_barcode_matching_distance_shuffled_{}.txt'.format(barcode_matching_folder, library, str(i + 1))
                with open(file2, 'r') as fin:
                    j = 0
                    for line in fin:
                        j += 1
                        if j > 1:
                            fout.write(line)
                call(['rm', file2])

        # UniqueMappedIlluminaBarcodes (shuffled)
        bci = np.loadtxt(combined_cmatcher_file, delimiter='\t', dtype='str', skiprows=1, usecols=(1,))
        bci = np.unique(bci)
        shuffled_bci_file = '{}/{}_unique_shuffled_illumina_barcodes.txt'.format(barcode_matching_folder, library)
        with open(shuffled_bci_file, 'w') as f1:
            for bc in bci:
                f1.write("%s\n" % bc)
        os.system('gzip -c ' + shuffled_bci_file + ' > ' + shuffled_bci_file + '.gz')

        # Submit downstream jobs per lane/slice for this library
        for i in range(len(lanes)):
            if libraries[i] != library:
                continue
            for lane_slice in slice_id[lanes[i]]:
                # Call tag_matched_bam
                output_file = '{}/logs/tag_matched_bam_{}_{}_{}_{}_{}.log'.format(
                    output_folder, library, lanes[i], lane_slice, barcodes[i], reference2)
                submission_script = '{}/tag_matched_bam.sh'.format(scripts_folder)
                call_args = [
                    'qsub', '-o', output_file, '-l', 'h_vmem=5G', '-notify', '-l',
                    'h_rt=10:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7',
                    submission_script, manifest_file, library, lanes[i], lane_slice,
                    barcodes[i], locus_function_list, scripts_folder, output_folder,
                    analysis_folder
                ]
                call_to_taskrunner(output_folder, call_args)

                # Call filter_unmapped_bam
                if gen_read1_plot:
                    output_file = '{}/logs/filter_unmapped_bam_{}_{}_{}_{}_{}.log'.format(
                        output_folder, library, lanes[i], lane_slice, barcodes[i], reference2)
                    submission_script = '{}/filter_unmapped_bam.sh'.format(scripts_folder)
                    call_args = [
                        'qsub', '-o', output_file, '-l', 'h_vmem=10G', '-notify', '-l',
                        'h_rt=10:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7',
                        submission_script, manifest_file, library, lanes[i], lane_slice,
                        barcodes[i], locus_function_list, scripts_folder, output_folder,
                        analysis_folder
                    ]
                    call_to_taskrunner(output_folder, call_args)

        # Call generate_plots_cmatcher
        output_file = '{}/logs/generate_plots_cmatcher_{}_{}.log'.format(output_folder, library, reference2)
        submission_script = '{}/generate_plots_cmatcher.sh'.format(scripts_folder)
        call_args = [
            'qsub', '-o', output_file, '-l', 'h_vmem=30G', '-notify', '-l',
            'h_rt=40:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7',
            submission_script, manifest_file, library, scripts_folder,
            locus_function_list, output_folder, '{}/{}'.format(analysis_folder, reference2)
        ]
        call_to_taskrunner(output_folder, call_args)

        call(['mv', folder_running, folder_finished])
    except Exception as exp:
        print("EXCEPTION!")
        print(exp)
        traceback.print_tb(exp.__traceback__, file=sys.stdout)
        if os.path.isdir(folder_running):
            call(['mv', folder_running, folder_failed])
        else:
            call(['mkdir', '-p', folder_failed])

        if len(email_address) > 1:
            subject = "Slide-seq workflow failed for " + flowcell_barcode
            content = ("The Slide-seq workflow for " + library + " " + reference2 +
                       " failed at the step of running cmatcher combine. Please check the log file for the issues. ")
            call_args = [
                'python', '{}/send_email.py'.format(scripts_folder), email_address, subject, content
            ]
            call(call_args)

        sys.exit()
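# Both main() functions in this section rely on a str2bool() helper whose
# definition is not shown here. The sketch below is an assumption about what
# such a helper might look like (named str2bool_sketch to make clear it is
# illustrative, not the pipeline's actual implementation, which may accept a
# different set of spellings):

def str2bool_sketch(s):
    """Interpret common textual spellings of a boolean flag as True/False."""
    return str(s).strip().lower() in ('true', 't', 'yes', 'y', '1')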
def main():
    if len(sys.argv) != 2:
        print("Please provide one argument: manifest file!")
        sys.exit()

    manifest_file = sys.argv[1]

    # Check if the manifest file exists
    if not os.path.isfile(manifest_file):
        print("File {} does not exist. Exiting...".format(manifest_file))
        sys.exit()

    # Read manifest file (one key=value pair per line)
    options = {}
    with open(manifest_file, "r") as fp:
        for line in fp:
            key, value = line.rstrip().split("=", 1)
            options[key] = value

    flowcell_directory = options['flowcell_directory']
    output_folder = options['output_folder']
    metadata_file = options['metadata_file']
    flowcell_barcode = options['flowcell_barcode']

    library_folder = options['library_folder'] if 'library_folder' in options else '{}/libraries'.format(output_folder)
    tmpdir = options['temp_folder'] if 'temp_folder' in options else '{}/tmp'.format(output_folder)
    dropseq_folder = options['dropseq_folder'] if 'dropseq_folder' in options else '/broad/macosko/bin/dropseq-tools'
    picard_folder = options['picard_folder'] if 'picard_folder' in options else '/broad/macosko/bin/dropseq-tools/3rdParty/picard'
    scripts_folder = options['scripts_folder'] if 'scripts_folder' in options else '/broad/macosko/jilong/slideseq_pipeline/scripts'
    is_NovaSeq = str2bool(options['is_NovaSeq']) if 'is_NovaSeq' in options else False
    is_NovaSeq_S4 = str2bool(options['is_NovaSeq_S4']) if 'is_NovaSeq_S4' in options else False
    num_slice_NovaSeq = int(options['num_slice_NovaSeq']) if 'num_slice_NovaSeq' in options else 10
    num_slice_NovaSeq_S4 = int(options['num_slice_NovaSeq_S4']) if 'num_slice_NovaSeq_S4' in options else 40
    email_address = options['email_address'] if 'email_address' in options else ''

    basecalls_dir = '{}/Data/Intensities/BaseCalls'.format(flowcell_directory)
    log_file = '{}/logs/workflow.log'.format(output_folder)

    # Get read structure from RunInfo.xml
    runinfo_file = '{}/RunInfo.xml'.format(flowcell_directory)
    read_structure = get_read_structure(runinfo_file)

    # Parse metadata file
    write_log(log_file, flowcell_barcode, "Parse metadata file. ")
    commandStr = ('python ' + scripts_folder + '/parse_metadata.py -i ' + metadata_file +
                  ' -r ' + runinfo_file + ' -o ' + '{}/parsed_metadata.txt'.format(output_folder))
    os.system(commandStr)

    # Read info from metadata file
    lanes = []
    lanes_unique = []
    libraries = []
    libraries_unique = []
    barcodes = []
    bead_structures = []
    references_unique = []
    locus_function_list_unique = []
    with open('{}/parsed_metadata.txt'.format(output_folder), 'r') as fin:
        reader = csv.reader(fin, delimiter='\t')
        rows = list(reader)
        row0 = rows[0]
        for i in range(1, len(rows)):
            row = rows[i]
            lanes.append(row[row0.index('lane')])
            if row[row0.index('lane')] not in lanes_unique:
                lanes_unique.append(row[row0.index('lane')])
            libraries.append(row[row0.index('library')])
            if row[row0.index('library')] not in libraries_unique:
                libraries_unique.append(row[row0.index('library')])
            references_unique.append(row[row0.index('reference')])
            locus_function_list_unique.append(row[row0.index('locus_function_list')])
            barcodes.append(row[row0.index('sample_barcode')])
            bead_structures.append(row[row0.index('bead_structure')])

    # Get tile information from RunInfo.xml
    slice_id = {}
    slice_first_tile = {}
    slice_tile_limit = {}
    for lane in lanes_unique:
        tile_nums = get_tiles(runinfo_file, lane)
        tile_cou = len(tile_nums)
        if (not is_NovaSeq) and (not is_NovaSeq_S4):
            slice_id[lane] = ['0']
            slice_first_tile[lane] = [str(tile_nums[0])]
            slice_tile_limit[lane] = [str(tile_cou)]
        else:
            slice_cou = num_slice_NovaSeq if is_NovaSeq else num_slice_NovaSeq_S4
            tile_cou_per_slice = (tile_cou // slice_cou) + 1
            slice_id[lane] = []
            slice_first_tile[lane] = []
            slice_tile_limit[lane] = []
            for i in range(slice_cou):
                if tile_cou_per_slice * i >= tile_cou:
                    break
                slice_id[lane].append(str(i))
                slice_first_tile[lane].append(str(tile_nums[tile_cou_per_slice * i]))
                slice_tile_limit[lane].append(str(tile_cou_per_slice))

    folder_running = '{}/status/running.run_preparation'.format(output_folder)
    folder_finished = '{}/status/finished.run_preparation'.format(output_folder)
    folder_failed = '{}/status/failed.run_preparation'.format(output_folder)

    try:
        call(['mkdir', '-p', folder_running])

        # Check if the input Illumina folder is in correct format
        commandStr = ('java -Djava.io.tmpdir=' + tmpdir +
                      ' -XX:+UseParallelOldGC -XX:ParallelGCThreads=1 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8192m ')
        commandStr += '-jar ' + picard_folder + '/picard.jar CheckIlluminaDirectory TMP_DIR=' + tmpdir + ' VALIDATION_STRINGENCY=SILENT '
        commandStr += 'BASECALLS_DIR=' + basecalls_dir + ' READ_STRUCTURE=' + read_structure
        if is_NovaSeq or is_NovaSeq_S4:
            commandStr += ' LINK_LOCS=false'
        for lane in lanes_unique:
            commandStr += ' L=' + lane
        write_log(log_file, flowcell_barcode, "CheckIlluminaDirectory Command=" + commandStr)
        os.system(commandStr)
        write_log(log_file, flowcell_barcode, "CheckIlluminaDirectory is done. ")

        # Create directories
        write_log(log_file, flowcell_barcode, "Creating directories. ")
        for lane in lanes_unique:
            call(['mkdir', '-p', '{}/{}'.format(output_folder, lane)])
            call(['mkdir', '-p', '{}/{}/barcodes'.format(output_folder, lane)])
            for lane_slice in slice_id[lane]:
                call(['mkdir', '-p', '{}/{}/{}'.format(output_folder, lane, lane_slice)])
        for i in range(len(lanes)):
            for lane_slice in slice_id[lanes[i]]:
                if not os.path.isdir('{}/{}/{}/{}'.format(output_folder, lanes[i], lane_slice, libraries[i])):
                    call(['mkdir', '-p', '{}/{}/{}/{}'.format(output_folder, lanes[i], lane_slice, libraries[i])])
                if barcodes[i]:
                    call(['mkdir', '-p', '{}/{}/{}/{}/{}'.format(output_folder, lanes[i], lane_slice, libraries[i], barcodes[i])])

        # Generate barcode_params.txt that is needed by ExtractIlluminaBarcodes
        for lane in lanes_unique:
            write_log(log_file, flowcell_barcode, "Generating barcode_params.txt for Lane " + lane)
            commandStr = ('python ' + scripts_folder + '/gen_barcode_params.py -i ' + output_folder +
                          '/parsed_metadata.txt -o ' + output_folder + '/' + lane + '/barcode_params.txt -l ' + lane)
            os.system(commandStr)

        # Generate library_params that is needed by IlluminaBasecallsToSam
        for lane in lanes_unique:
            write_log(log_file, flowcell_barcode, "Generating library_params.txt for Lane " + lane)
            for lane_slice in slice_id[lane]:
                commandStr = ('python ' + scripts_folder + '/gen_library_params.py -i ' + output_folder +
                              '/parsed_metadata.txt -o ' + output_folder + '/' + lane + '/' + lane_slice + '/library_params.txt -b ')
                commandStr += (output_folder + '/' + lane + '/' + lane_slice + '/ -n ' +
                               flowcell_barcode + '.' + lane + '.' + lane_slice + ' -l ' + lane)
                os.system(commandStr)

        # Call run_processbarcodes
        for lane in lanes_unique:
            output_file = '{}/logs/run_processbarcodes_lane_{}.log'.format(output_folder, lane)
            submission_script = '{}/run_processbarcodes.sh'.format(scripts_folder)
            call_args = [
                'qsub', '-o', output_file, '-l', 'h_vmem=70g', '-notify', '-l',
                'h_rt=06:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7',
                submission_script, manifest_file, lane, scripts_folder, output_folder,
                '{}/{}'.format(output_folder, lane)
            ]
            call_to_taskrunner(output_folder, call_args)

        # Call run_mergebarcodes
        output_file = '{}/logs/run_mergebarcodes.log'.format(output_folder)
        submission_script = '{}/run_mergebarcodes.sh'.format(scripts_folder)
        call_args = [
            'qsub', '-o', output_file, '-l', 'h_vmem=5G', '-notify', '-l',
            'h_rt=100:0:0', '-j', 'y', '-P', 'macosko_lab', '-l', 'os=RedHat7',
            submission_script, manifest_file, scripts_folder, output_folder
        ]
        call_to_taskrunner(output_folder, call_args)

        call(['mv', folder_running, folder_finished])
    except Exception as exp:
        print("EXCEPTION!")
        print(exp)
        traceback.print_tb(exp.__traceback__, file=sys.stdout)
        if os.path.isdir(folder_running):
            call(['mv', folder_running, folder_failed])
        else:
            call(['mkdir', '-p', folder_failed])

        if len(email_address) > 1:
            subject = "Slide-seq workflow failed for " + flowcell_barcode
            content = "The Slide-seq workflow failed at the step of preparation. Please check the log file for the issues. "
            call_args = [
                'python', '{}/send_email.py'.format(scripts_folder), email_address, subject, content
            ]
            call(call_args)

        sys.exit()