def lambda_handler(event, context):
    # Log the received event
    bucket = event["Records"][0]["s3"]["bucket"]["name"]
    key = event["Records"][0]["s3"]["object"]["key"]
    prefix, batchAndPipe = key.split("pipelines/")
    image_prefix = prefix.split("workspace")[0]
    batch = batchAndPipe.split(pipeline_name)[0][:-1]

    # Get the metadata file, so we can add stuff to it
    metadata_on_bucket_name = os.path.join(prefix, "metadata", batch, "metadata.json")
    metadata = helpful_functions.download_and_read_metadata_file(
        s3, bucket, metadata_file_name, metadata_on_bucket_name
    )
    num_series = int(metadata["barcoding_rows"]) * int(metadata["barcoding_columns"])
    if "barcoding_imperwell" in list(metadata.keys()):
        if metadata["barcoding_imperwell"] != "":
            if int(metadata["barcoding_imperwell"]) != 0:
                num_series = int(metadata["barcoding_imperwell"])
    expected_cycles = int(metadata["barcoding_cycles"])

    # Get the list of images in this experiment - this can take a long time
    # for big experiments so let's add some prints
    print("Getting the list of images")
    image_list_prefix = (
        image_prefix + batch + "/images/"
    )  # the slash here is critical, because we don't want to read images_corrected because it's huge
    image_list = helpful_functions.paginate_a_folder(s3, bucket, image_list_prefix)
    print("Image list retrieved")
    image_dict = helpful_functions.parse_image_names(
        image_list, filter_in="10X", filter_out="copy"
    )
    metadata["barcoding_file_data"] = image_dict
    print("Parsing the image list")
    # We've saved the previous for looking at/debugging later, but really all
    # we want is the wells with all cycles present
    if metadata["one_or_many_files"] == 1:
        parsed_image_dict = helpful_functions.return_full_wells(
            image_dict, expected_cycles, metadata["one_or_many_files"]
        )
    else:
        parsed_image_dict = helpful_functions.return_full_wells(
            image_dict,
            expected_cycles,
            metadata["one_or_many_files"],
            files_per_well=num_series,
        )
    metadata["wells_with_all_cycles"] = parsed_image_dict
    helpful_functions.write_metadata_file(
        s3, bucket, metadata, metadata_file_name, metadata_on_bucket_name
    )

    # Pull the file names we care about, and make the CSV
    print("Making the CSVs")
    platelist = list(image_dict.keys())
    for eachplate in platelist:
        platedict = parsed_image_dict[eachplate]
        well_list = list(platedict.keys())
        bucket_folder = (
            "/home/ubuntu/bucket/" + image_prefix + batch + "/images/" + eachplate
        )
        per_plate_csv = create_CSVs.create_CSV_pipeline5(
            eachplate,
            num_series,
            expected_cycles,
            bucket_folder,
            platedict,
            metadata["one_or_many_files"],
            metadata["fast_or_slow_mode"],
        )
        csv_on_bucket_name = (
            prefix
            + "load_data_csv/"
            + batch
            + "/"
            + eachplate
            + "/load_data_pipeline5.csv"
        )
        with open(per_plate_csv, "rb") as a:
            s3.put_object(Body=a, Bucket=bucket, Key=csv_on_bucket_name)

    # Now it's time to run DCP
    # Replacement for 'fab setup'
    app_name = run_DCP.run_setup(bucket, prefix, batch, step)
    # run_DCP.grab_batch_config(bucket,prefix,batch,step)

    # Make a batch
    create_batch_jobs.create_batch_jobs_5(
        image_prefix, batch, pipeline_name, platelist, expected_cycles, app_name
    )

    # Start a cluster
    run_DCP.run_cluster(
        bucket, prefix, batch, step, fleet_file_name, len(platelist) * expected_cycles
    )

    # Run the monitor
    run_DCP.run_monitor(bucket, prefix, batch, step)
    print("Go run the monitor now")
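# ---------------------------------------------------------------------------
# Hedged sketch (not part of the handler above): illustrates how the trigger
# object key is decomposed into prefix / image_prefix / batch by the string
# splits at the top of lambda_handler. The key and pipeline name below are
# invented placeholder values, not taken from a real experiment.
# ---------------------------------------------------------------------------
def _demo_key_parsing(
    key="example_project/workspace/pipelines/20210101_Batch1/5_BC_Illum.cppipe",
    pipeline_name="5_BC_Illum.cppipe",
):
    """Return (prefix, image_prefix, batch) for a hypothetical S3 object key."""
    prefix, batchAndPipe = key.split("pipelines/")
    image_prefix = prefix.split("workspace")[0]
    batch = batchAndPipe.split(pipeline_name)[0][:-1]  # drop the trailing "/"
    # For the defaults above this returns:
    #   ("example_project/workspace/", "example_project/", "20210101_Batch1")
    return prefix, image_prefix, batch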
def lambda_handler(event, context):
    # Log the received event
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    prefix, batchAndPipe = key.split('pipelines/')
    image_prefix = prefix.split('workspace')[0]
    batch = batchAndPipe.split(pipeline_name)[0][:-1]

    # Get the metadata file, so we can add stuff to it
    metadata_on_bucket_name = os.path.join(prefix, 'metadata', batch, 'metadata.json')
    metadata = helpful_functions.download_and_read_metadata_file(
        s3, bucket, metadata_file_name, metadata_on_bucket_name)
    num_series = int(metadata['painting_rows']) * int(metadata['painting_columns'])
    if "painting_imperwell" in metadata.keys():
        if metadata["painting_imperwell"] != "":
            if int(metadata["painting_imperwell"]) != 0:
                num_series = int(metadata["painting_imperwell"])

    # Get the list of images in this experiment
    image_list_prefix = image_prefix + batch + '/images/'
    image_list = helpful_functions.paginate_a_folder(s3, bucket, image_list_prefix)
    image_dict = helpful_functions.parse_image_names(
        image_list, filter_in='20X', filter_out='copy')
    metadata['painting_file_data'] = image_dict
    helpful_functions.write_metadata_file(
        s3, bucket, metadata, metadata_file_name, metadata_on_bucket_name)

    # How many files per well indicates the well has all images present
    if metadata['one_or_many_files'] == 'one':
        full_well_files = 1
    else:
        full_well_files = num_series

    # Pull the file names we care about, and make the CSV
    platelist = list(image_dict.keys())
    for eachplate in platelist:
        platedict = image_dict[eachplate]
        well_list = list(platedict.keys())
        # dict views aren't indexable in Python 3, so cast to a list first
        paint_cycle_name = list(platedict[well_list[0]].keys())[0]
        per_well_im_list = []
        for eachwell in well_list:
            per_well = platedict[eachwell][paint_cycle_name]
            per_well.sort()
            if len(per_well) == full_well_files:
                per_well_im_list.append(per_well)
        bucket_folder = ('/home/ubuntu/bucket/' + image_prefix + batch +
                         '/images/' + eachplate + '/' + paint_cycle_name)
        per_plate_csv = create_CSVs.create_CSV_pipeline1(
            eachplate, num_series, bucket_folder, per_well_im_list,
            metadata['one_or_many_files'])
        csv_on_bucket_name = (prefix + 'load_data_csv/' + batch + '/' +
                              eachplate + '/load_data_pipeline1.csv')
        with open(per_plate_csv, 'rb') as a:
            s3.put_object(Body=a, Bucket=bucket, Key=csv_on_bucket_name)

    # Now it's time to run DCP
    # Replacement for 'fab setup'
    app_name = run_DCP.run_setup(bucket, prefix, batch, step)
    # run_DCP.grab_batch_config(bucket,prefix,batch,step)

    # Make a batch
    create_batch_jobs.create_batch_jobs_1(image_prefix, batch, pipeline_name,
                                          platelist, app_name)

    # Start a cluster
    run_DCP.run_cluster(bucket, prefix, batch, step, fleet_file_name,
                        len(platelist))

    # Run the monitor
    run_DCP.run_monitor(bucket, prefix, batch, step)
    print('Go run the monitor now')
def lambda_handler(event, context):
    bucket_name = event["Records"][0]["s3"]["bucket"]["name"]
    key = event["Records"][0]["s3"]["object"]["key"]

    if "csv" in key:
        plate = key.split("/")[-2].split("-")[0]
        batch = key.split("/")[-5]
        image_prefix = key.split(batch)[0]
    else:
        batch = key.split("/")[-2]
        image_prefix = key.split("workspace")[0]
    prefix = os.path.join(image_prefix, "workspace/")
    print(batch, prefix)

    # Get the metadata file, so we can add stuff to it
    metadata_on_bucket_name = os.path.join(prefix, "metadata", batch, "metadata.json")
    print("Loading", metadata_on_bucket_name)
    metadata = helpful_functions.download_and_read_metadata_file(
        s3, bucket_name, metadata_file_name, metadata_on_bucket_name
    )
    image_dict = metadata["painting_file_data"]
    num_series = int(metadata["painting_rows"]) * int(metadata["painting_columns"])
    if "painting_imperwell" in list(metadata.keys()):
        if metadata["painting_imperwell"] != "":
            if int(metadata["painting_imperwell"]) != 0:
                num_series = int(metadata["painting_imperwell"])
    out_range = list(range(0, num_series, range_skip))
    expected_files_per_well = (num_series * int(metadata["painting_channels"])) + 6

    platelist = list(image_dict.keys())
    plate_and_well_list = []
    for eachplate in platelist:
        platedict = image_dict[eachplate]
        well_list = list(platedict.keys())
        for eachwell in well_list:
            plate_and_well_list.append((eachplate, eachwell))
    metadata["painting_plate_and_well_list"] = plate_and_well_list
    helpful_functions.write_metadata_file(
        s3, bucket_name, metadata, metadata_file_name, metadata_on_bucket_name
    )

    # First let's check if it seems like the whole thing is done or not
    sqs = boto3.client("sqs")
    filter_prefix = image_prefix + batch + "/images_corrected/painting"
    # Expected length shows that all transfers (i.e. all wells) have at least started
    expected_len = ((len(plate_and_well_list) - 1) * expected_files_per_well) + 1
    print("Checking if all files are present")
    done = helpful_functions.check_if_run_done(
        s3,
        bucket_name,
        filter_prefix,
        expected_len,
        current_app_name,
        prev_step_app_name,
        sqs,
        duplicate_queue_name,
    )

    if not done:
        print("Still work ongoing")
        return "Still work ongoing"
    else:
        print("Checking CSVs for thresholds")
        image_csv_list = helpful_functions.paginate_a_folder(
            s3,
            bucket_name,
            os.path.join(image_prefix, batch, "images_corrected/painting"),
        )
        image_csv_list = [x for x in image_csv_list if "Image.csv" in x]
        image_df = helpful_functions.concat_some_csvs(
            s3, bucket_name, image_csv_list, "Image.csv"
        )
        threshes = image_df["Threshold_FinalThreshold_Cells"]
        calc_upper_percentile = numpy.percentile(threshes, upper_percentile)
        print(
            "In ",
            len(image_csv_list) * num_series,
            f"images, the {upper_percentile} percentile was",
            calc_upper_percentile,
        )
        calc_lower_percentile = numpy.percentile(threshes, lower_percentile)
        print(
            "In ",
            len(image_csv_list) * num_series,
            f"images, the {lower_percentile} percentile was",
            calc_lower_percentile,
        )
        pipeline_on_bucket_name = os.path.join(prefix, "pipelines", batch, pipeline_name)
        local_pipeline_name = os.path.join("/tmp", pipeline_name)
        local_temp_pipeline_name = os.path.join(
            "/tmp", pipeline_name.split(".")[0] + "_edited.cppipe"
        )
        with open(local_pipeline_name, "wb") as f:
            s3.download_fileobj(bucket_name, pipeline_on_bucket_name, f)
        edit_id_secondary(
            local_pipeline_name,
            local_temp_pipeline_name,
            calc_lower_percentile,
            calc_upper_percentile,
        )
        with open(local_temp_pipeline_name, "rb") as pipeline:
            s3.put_object(Body=pipeline, Bucket=bucket_name, Key=pipeline_on_bucket_name)
        print("Edited pipeline file")

        # Pull the file names we care about, and make the CSV
        for eachplate in platelist:
            platedict = image_dict[eachplate]
            well_list = list(platedict.keys())
            bucket_folder = (
                "/home/ubuntu/bucket/"
                + image_prefix
                + batch
                + "/images_corrected/painting"
            )
            per_plate_csv = create_CSVs.create_CSV_pipeline3(
                eachplate, num_series, bucket_folder, well_list, range_skip
            )
            csv_on_bucket_name = (
                prefix
                + "load_data_csv/"
                + batch
                + "/"
                + eachplate
                + "/load_data_pipeline3.csv"
            )
            print("Created", csv_on_bucket_name)
            with open(per_plate_csv, "rb") as a:
                s3.put_object(Body=a, Bucket=bucket_name, Key=csv_on_bucket_name)

        # Now let's do our stuff!
        app_name = run_DCP.run_setup(bucket_name, prefix, batch, step)

        # Make the jobs
        create_batch_jobs.create_batch_jobs_3(
            image_prefix, batch, pipeline_name, plate_and_well_list, out_range, app_name
        )

        # Start a cluster
        run_DCP.run_cluster(
            bucket_name,
            prefix,
            batch,
            step,
            fleet_file_name,
            len(plate_and_well_list) * len(out_range),
        )

        # Run the monitor
        run_DCP.run_monitor(bucket_name, prefix, batch, step)
        print("Go run the monitor now")
        return "Cluster started"
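# ---------------------------------------------------------------------------
# Hedged sketch (assumed values, not real measurements): the handler above
# edits the CellProfiler pipeline using numpy.percentile over the
# Threshold_FinalThreshold_Cells column of the concatenated Image.csv files.
# This shows that arithmetic in isolation; lower_percentile / upper_percentile
# mirror the module-level settings the handler assumes.
# ---------------------------------------------------------------------------
def _demo_threshold_percentiles(lower_percentile=10, upper_percentile=90):
    import numpy
    # Hypothetical per-image final cell thresholds pulled from Image.csv files.
    threshes = [0.0021, 0.0025, 0.0030, 0.0044, 0.0061]
    calc_lower = numpy.percentile(threshes, lower_percentile)
    calc_upper = numpy.percentile(threshes, upper_percentile)
    # These two values are what edit_id_secondary writes back into the pipeline.
    return calc_lower, calc_upper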
def lambda_handler(event, context):
    # Log the received event
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    prefix, batchAndPipe = key.split('pipelines/')
    image_prefix = prefix.split('workspace')[0]
    batch = batchAndPipe.split(pipeline_name)[0][:-1]

    # Get the metadata file, so we can add stuff to it
    metadata_on_bucket_name = os.path.join(prefix, 'metadata', batch, 'metadata.json')
    metadata = helpful_functions.download_and_read_metadata_file(
        s3, bucket, metadata_file_name, metadata_on_bucket_name)
    num_series = int(metadata['barcoding_rows']) * int(metadata['barcoding_columns'])
    if "barcoding_imperwell" in metadata.keys():
        if metadata["barcoding_imperwell"] != "":
            if int(metadata["barcoding_imperwell"]) != 0:
                num_series = int(metadata["barcoding_imperwell"])
    expected_cycles = int(metadata['barcoding_cycles'])

    # Get the list of images in this experiment - this can take a long time
    # for big experiments so let's add some prints
    print('Getting the list of images')
    # The slash here is critical, because we don't want to read
    # images_corrected because it's huge
    image_list_prefix = image_prefix + batch + '/images/'
    image_list = helpful_functions.paginate_a_folder(s3, bucket, image_list_prefix)
    print('Image list retrieved')
    image_dict = helpful_functions.parse_image_names(
        image_list, filter_in='10X', filter_out='copy')
    metadata['barcoding_file_data'] = image_dict
    print('Parsing the image list')
    # We've saved the previous for looking at/debugging later, but really all
    # we want is the wells with all cycles present
    if metadata['one_or_many_files'] == 1:
        parsed_image_dict = helpful_functions.return_full_wells(
            image_dict, expected_cycles, metadata['one_or_many_files'])
    else:
        parsed_image_dict = helpful_functions.return_full_wells(
            image_dict, expected_cycles, metadata['one_or_many_files'],
            files_per_well=num_series)
    metadata['wells_with_all_cycles'] = parsed_image_dict
    helpful_functions.write_metadata_file(
        s3, bucket, metadata, metadata_file_name, metadata_on_bucket_name)

    # Pull the file names we care about, and make the CSV
    print('Making the CSVs')
    platelist = image_dict.keys()
    for eachplate in platelist:
        platedict = parsed_image_dict[eachplate]
        well_list = platedict.keys()
        bucket_folder = ('/home/ubuntu/bucket/' + image_prefix + batch +
                         '/images/' + eachplate)
        per_plate_csv = create_CSVs.create_CSV_pipeline5(
            eachplate, num_series, expected_cycles, bucket_folder, platedict,
            metadata['one_or_many_files'], metadata["fast_or_slow_mode"])
        csv_on_bucket_name = (prefix + 'load_data_csv/' + batch + '/' +
                              eachplate + '/load_data_pipeline5.csv')
        with open(per_plate_csv, 'rb') as a:
            s3.put_object(Body=a, Bucket=bucket, Key=csv_on_bucket_name)

    # Now it's time to run DCP
    # Replacement for 'fab setup'
    app_name = run_DCP.run_setup(bucket, prefix, batch, step)
    # run_DCP.grab_batch_config(bucket,prefix,batch,step)

    # Make a batch
    create_batch_jobs.create_batch_jobs_5(image_prefix, batch, pipeline_name,
                                          platelist, expected_cycles, app_name)

    # Start a cluster
    run_DCP.run_cluster(bucket, prefix, batch, step, fleet_file_name,
                        len(platelist) * expected_cycles)

    # Run the monitor
    run_DCP.run_monitor(bucket, prefix, batch, step)
    print('Go run the monitor now')
def lambda_handler(event, context):
    bucket = event["Records"][0]["s3"]["bucket"]["name"]
    key = event["Records"][0]["s3"]["object"]["key"]
    prefix, batchAndPipe = key.split("pipelines/")
    image_prefix = prefix.split("workspace")[0]
    batch = batchAndPipe.split("1_")[0][:-1]

    # Get the metadata file
    metadata_on_bucket_name = os.path.join(prefix, "metadata", batch, "metadata.json")
    metadata = helpful_functions.download_and_read_metadata_file(
        s3, bucket, metadata_file_name, metadata_on_bucket_name
    )

    # Standard vs. SABER configs
    if "Channeldict" not in list(metadata.keys()):
        print("Update your metadata.json to include Channeldict")
        return "Update your metadata.json to include Channeldict"
    Channeldict = ast.literal_eval(metadata["Channeldict"])
    if len(Channeldict.keys()) == 1:
        SABER = False
        print("Not a SABER experiment")
    if len(Channeldict.keys()) > 1:
        SABER = True
        print("SABER experiment")

    # Calculate number of images from rows and columns in metadata
    num_series = int(metadata["painting_rows"]) * int(metadata["painting_columns"])
    # Overwrite rows x columns number series if images per well set in metadata
    if "painting_imperwell" in list(metadata.keys()):
        if metadata["painting_imperwell"] != "":
            if int(metadata["painting_imperwell"]) != 0:
                num_series = int(metadata["painting_imperwell"])

    # Get the list of images in this experiment
    if not SABER:
        parse_name_filter = "20X_CP_"
    if SABER:
        parse_name_filter = ""
    image_list_prefix = image_prefix + batch + "/images/"
    image_list = helpful_functions.paginate_a_folder(s3, bucket, image_list_prefix)
    image_dict = helpful_functions.parse_image_names(
        image_list, filter_in=parse_name_filter, filter_out="copy"
    )
    metadata["painting_file_data"] = image_dict
    helpful_functions.write_metadata_file(
        s3, bucket, metadata, metadata_file_name, metadata_on_bucket_name
    )

    # How many files/well indicates the well has all images present
    if metadata["one_or_many_files"] == "one":
        full_well_files = 1
    else:
        full_well_files = num_series

    # Pull the file names we care about, and make the CSV
    platelist = list(image_dict.keys())
    for eachplate in platelist:
        platedict = image_dict[eachplate]
        well_list = list(platedict.keys())
        Channelrounds = list(Channeldict.keys())
        # Only keep full wells
        print(f"{full_well_files} expected files per well and round for {eachplate}")
        incomplete_wells = []
        for eachwell in well_list:
            for eachround in Channelrounds:
                per_well = platedict[eachwell][eachround]
                if len(per_well) != full_well_files:
                    incomplete_wells.append(eachwell)
                    print(
                        f"{eachwell} {eachround} doesn't have full well files. {len(per_well)} files found."
                    )
        if incomplete_wells:
            # Dedupe so a well short in more than one round is only deleted once
            for well in set(incomplete_wells):
                del platedict[well]
        bucket_folder = (
            "/home/ubuntu/bucket/" + image_prefix + batch + "/images/" + eachplate + "/"
        )
        illum_folder = (
            "/home/ubuntu/bucket/" + image_prefix + batch + "/illum/" + eachplate
        )
        per_plate_csv, per_plate_csv_2 = create_CSVs.create_CSV_pipeline1(
            eachplate,
            num_series,
            bucket_folder,
            illum_folder,
            platedict,
            metadata["one_or_many_files"],
            metadata["Channeldict"],
        )
        csv_on_bucket_name = (
            prefix
            + "load_data_csv/"
            + batch
            + "/"
            + eachplate
            + "/load_data_pipeline1.csv"
        )
        csv_on_bucket_name_2 = (
            prefix
            + "load_data_csv/"
            + batch
            + "/"
            + eachplate
            + "/load_data_pipeline2.csv"
        )
        with open(per_plate_csv, "rb") as a:
            s3.put_object(Body=a, Bucket=bucket, Key=csv_on_bucket_name)
        with open(per_plate_csv_2, "rb") as a:
            s3.put_object(Body=a, Bucket=bucket, Key=csv_on_bucket_name_2)

    # Now it's time to run DCP
    app_name = run_DCP.run_setup(bucket, prefix, batch, step)

    # Make a batch
    if not SABER:
        pipeline_name = "1_CP_Illum.cppipe"
    if SABER:
        pipeline_name = "1_SABER_CP_Illum.cppipe"
    create_batch_jobs.create_batch_jobs_1(
        image_prefix, batch, pipeline_name, platelist, app_name
    )

    # Start a cluster
    run_DCP.run_cluster(bucket, prefix, batch, step, fleet_file_name, len(platelist))

    # Run the monitor
    run_DCP.run_monitor(bucket, prefix, batch, step)
    print("Go run the monitor now")
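# ---------------------------------------------------------------------------
# Hedged sketch: how the handler above distinguishes a standard experiment from
# a SABER one using the Channeldict metadata string. The dictionary contents
# below are invented for illustration; only the "one acquisition round =
# standard, multiple rounds = SABER" rule comes from the code above.
# ---------------------------------------------------------------------------
def _demo_saber_detection(channeldict_str="{'20X_CP_': {'DAPI': 0, 'Phalloidin': 1}}"):
    import ast
    # metadata.json stores Channeldict as a string, so it is parsed with literal_eval
    Channeldict = ast.literal_eval(channeldict_str)
    SABER = len(Channeldict.keys()) > 1  # more than one round means SABER
    return SABER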
def lambda_handler(event, context):
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']

    if 'csv' in key:
        plate = key.split('/')[-2].split('-')[0]
        batch = key.split('/')[-5]
        image_prefix = key.split(batch)[0]
    else:
        batch = key.split('/')[-2]
        image_prefix = key.split('workspace')[0]
    prefix = os.path.join(image_prefix, 'workspace/')
    print(batch, prefix)

    # Get the metadata file, so we can add stuff to it
    metadata_on_bucket_name = os.path.join(prefix, 'metadata', batch, 'metadata.json')
    print('Loading', metadata_on_bucket_name)
    metadata = helpful_functions.download_and_read_metadata_file(
        s3, bucket_name, metadata_file_name, metadata_on_bucket_name)
    image_dict = metadata['painting_file_data']
    num_series = int(metadata['painting_rows']) * int(metadata['painting_columns'])
    if "painting_imperwell" in metadata.keys():
        if metadata["painting_imperwell"] != "":
            if int(metadata["painting_imperwell"]) != 0:
                num_series = int(metadata["painting_imperwell"])
    out_range = range(0, num_series, range_skip)
    expected_files_per_well = num_series * 6
    platelist = image_dict.keys()
    plate_and_well_list = metadata['painting_plate_and_well_list']

    # First let's check if 3A is done
    filter_prefix = image_prefix + batch + '/images_segmentation/segment_troubleshoot'
    expected_len = len(plate_and_well_list) * expected_files_per_well
    print('Checking if all files are present')
    done = helpful_functions.check_if_run_done(
        s3, bucket_name, filter_prefix, expected_len, current_app_name,
        prev_step_app_name, sqs, duplicate_queue_name)

    if not done:
        print('Still work ongoing')
        return 'Still work ongoing'
    else:
        print("Checking CSVs for what the upper threshold should be")
        image_csv_list = helpful_functions.paginate_a_folder(
            s3, bucket_name,
            os.path.join(image_prefix, batch, 'images_segmentation/troubleshoot'))
        image_csv_list = [x for x in image_csv_list if 'Image.csv' in x]
        image_df = helpful_functions.concat_some_csvs(
            s3, bucket_name, image_csv_list, 'Image.csv')
        threshes = image_df['Threshold_FinalThreshold_Cells']
        percentile = numpy.percentile(threshes, 90)
        print("In ", len(image_csv_list) * num_series,
              "images, the 90th percentile was", percentile)
        pipeline_on_bucket_name = os.path.join(prefix, 'pipelines', batch,
                                               pipeline_name)
        local_pipeline_name = os.path.join('/tmp', pipeline_name)
        local_temp_pipeline_name = os.path.join(
            '/tmp', pipeline_name.split('.')[0] + '_edited.cppipe')
        with open(local_pipeline_name, 'wb') as f:
            s3.download_fileobj(bucket_name, pipeline_on_bucket_name, f)
        edit_id_secondary(local_pipeline_name, local_temp_pipeline_name, percentile)
        with open(local_temp_pipeline_name, 'rb') as pipeline:
            s3.put_object(Body=pipeline, Bucket=bucket_name,
                          Key=pipeline_on_bucket_name)
        print('Edited pipeline file')

        # Pull the file names we care about, and make the CSV
        for eachplate in platelist:
            platedict = image_dict[eachplate]
            well_list = platedict.keys()
            bucket_folder = ('/home/ubuntu/bucket/' + image_prefix + batch +
                             '/images_corrected/painting')
            per_plate_csv = create_CSVs.create_CSV_pipeline3(
                eachplate, num_series, bucket_folder, well_list, range_skip)
            csv_on_bucket_name = (prefix + 'load_data_csv/' + batch + '/' +
                                  eachplate + '/load_data_pipeline3B.csv')
            print('Created', csv_on_bucket_name)
            with open(per_plate_csv, 'rb') as a:
                s3.put_object(Body=a, Bucket=bucket_name, Key=csv_on_bucket_name)

        # Now let's do our stuff!
        app_name = run_DCP.run_setup(bucket_name, prefix, batch, step)
        print('app_name is', app_name)

        # Make the jobs
        create_batch_jobs.create_batch_jobs_3B(image_prefix, batch, pipeline_name,
                                               plate_and_well_list, out_range,
                                               app_name)

        # Start a cluster
        run_DCP.run_cluster(bucket_name, prefix, batch, config_step,
                            fleet_file_name,
                            len(plate_and_well_list) * len(out_range))

        # Create the monitor
        run_DCP.run_monitor(bucket_name, prefix, batch, step)
        print('Go run the monitor now')
        return 'Cluster started'