def test_no_sample_name(self):
    """
    Raise ValueError when the [Data] header lacks a Sample_Name column.
    """
    sheet_text = """
[Header]
IEMFileVersion,3
Investigator Name,RL
Project Name,11-Jul-2014_nosamplenametest
Experiment Name,11-Jul-2014_nosamplenametest
Date,07/11/2014
Workflow,GenerateFASTQ
Assay,Nextera
Description,Nextera
Chemistry,Amplicon
[Reads]
251
251
[Settings]
[Data]
There,Is,No,Sample,Name
A,B,C,D,E
"""
    with self.assertRaises(ValueError) as context:
        sample_sheet_parser(StringIO(sheet_text))
    # The parser names the exact required column that is missing.
    self.assertEqual(
        "sample sheet data header does not include Sample_Name",
        context.exception.args[0])
def process_run(self, run_folder: Path):
    """ Tally the project combinations found in one MiSeq run folder.

    Groups the sample sheet's DataSplit rows by sample number, counts each
    sorted tuple of project codes in self.project_counts, and records the
    latest run folder name seen for that tuple in self.latest_dates.

    :param run_folder: the run directory to inspect
    :return: False if the run was never flagged for processing, True
        otherwise (including runs already marked as failed).
    :raises RuntimeError: if the sample sheet cannot be parsed.
    """
    if not (run_folder / 'needsprocessing').exists():
        return False
    if (run_folder / 'errorprocessing').exists():
        return True
    sample_sheet_path = run_folder / 'SampleSheet.csv'
    folder_name = run_folder.name
    with sample_sheet_path.open() as f:
        try:
            run_info = sample_sheet_parser(f)
        except Exception as ex:
            # Chain the original parser error so the traceback shows the
            # real cause, not just which run failed.
            raise RuntimeError(
                f'Failed to process run {folder_name}.') from ex
    project_groups = defaultdict(list)
    for sample_info in run_info['DataSplit']:
        sample_number = sample_info['sample_number']
        project = sample_info['project']
        project_groups[sample_number].append(project)
    for project_names in project_groups.values():
        # Sort so the same combination always yields the same tuple key.
        project_names.sort()
        name_tuple = tuple(project_names)
        self.project_counts[name_tuple] += 1
        self.latest_dates[name_tuple] = max(self.latest_dates[name_tuple],
                                            folder_name)
    top_projects = self.project_counts.most_common(3)
    summary = ', '.join(f'({", ".join(project_codes)}): {count}'
                        for project_codes, count in top_projects)
    logger.debug('After %s, top counts are: %s', folder_name, summary)
    return True
def find_groups(file_names, sample_sheet_path, included_projects=None):
    """ Group HCV samples with their MIDI partners.

    :param list[str] file_names: a list of FASTQ file names without paths
    :param sample_sheet_path: path to the SampleSheet.csv file
    :param included_projects: project codes to include, or None to include
        all
    """
    with open(sample_sheet_path) as sheet_file:
        run_info = sample_sheet_parser(sheet_file)
    # Split the DataSplit rows into MIDI samples and everything else.
    midi_files = {}
    wide_names = {}
    for row in run_info['DataSplit']:
        if row['project'] == 'MidHCV':
            midi_files[row['sample']] = row['filename']
        elif included_projects is None or row['project'] in included_projects:
            wide_names[row['filename']] = row['sample']
    # Map the first two underscore-separated fields back to the full name.
    trimmed_names = {}
    for file_name in file_names:
        prefix = '_'.join(file_name.split('_')[:2])
        trimmed_names[prefix] = file_name
    for prefix, file_name in sorted(trimmed_names.items()):
        sample_name = wide_names.get(prefix)
        if sample_name is None:
            # Project was not included.
            continue
        midi_trimmed = midi_files.get(sample_name + 'MIDI')
        midi_name = trimmed_names.get(midi_trimmed)
        yield SampleGroup(sample_name, (file_name, midi_name))
def process_folder(result_folder,
                   qai_server,
                   qai_user,
                   qai_password,
                   pipeline_version):
    """ Upload one result folder's collated outputs to QAI/Oracle.

    Reads the run's SampleSheet.csv (two directories above result_folder),
    logs in to QAI, looks up the run by its Experiment Name, builds the
    consensus records, and uploads the review files.

    :param result_folder: path to the version-specific results folder
    :param qai_server: QAI server URL
    :param qai_user: QAI login name
    :param qai_password: QAI login password
    :param pipeline_version: version string reported with the upload
    """
    logger.info('Uploading data to Oracle from {}'.format(result_folder))
    collated_conseqs = os.path.join(result_folder, 'conseq.csv')
    collated_counts = os.path.join(result_folder, 'remap_counts.csv')
    cascade = os.path.join(result_folder, 'cascade.csv')
    coverage_scores = os.path.join(result_folder, 'coverage_scores.csv')
    all_results_path, _ = os.path.split(os.path.normpath(result_folder))
    run_path, _ = os.path.split(all_results_path)
    sample_sheet_file = os.path.join(run_path, "SampleSheet.csv")
    # 'rU' mode was removed in Python 3.11; plain 'r' already uses
    # universal newlines in Python 3.
    with open(sample_sheet_file, "r") as f:
        sample_sheet = sample_sheet_parser.sample_sheet_parser(f)

    ok_sample_regions = load_ok_sample_regions(result_folder)

    with qai_helper.Session() as session:
        session.login(qai_server, qai_user, qai_password)
        run = find_run(session, sample_sheet["Experiment Name"])

        with open(collated_conseqs, "r") as f:
            conseqs = build_conseqs(f, run, sample_sheet, ok_sample_regions)
        with open(coverage_scores, "r") as f, \
                open(collated_counts, "r") as f2, \
                open(cascade, "r") as f3:
            upload_review_to_qai(f, f2, f3, run, sample_sheet, conseqs,
                                 session, pipeline_version)
def process_run(run_folder: Path, skip_mid_hcv: bool):
    """ Check one run folder for samples missing from its cascade.csv.

    :param run_folder: the run directory to inspect
    :param skip_mid_hcv: if True, ignore samples whose names end in
        MidHCV_S<number>
    :return: False if the run was never flagged for processing, True
        otherwise (including runs already marked as failed).
    :raises RuntimeError: if the sample sheet cannot be parsed.
    """
    if not (run_folder / 'needsprocessing').exists():
        return False
    if (run_folder / 'errorprocessing').exists():
        return True
    sample_sheet_path = run_folder / 'SampleSheet.csv'
    with sample_sheet_path.open() as f:
        try:
            run_info = sample_sheet_parser(f)
        except Exception as ex:
            # Chain the original parser error so the root cause is kept.
            raise RuntimeError(
                f'Failed to process run {run_folder.name}.') from ex
    sample_names = set(run_info['Data'])
    if skip_mid_hcv:
        sample_names = {sample_name
                        for sample_name in sample_names
                        if not re.match(r'.*MidHCV_S\d+$', sample_name)}
    # NOTE(review): results version is hard-coded — confirm version_7.9 is
    # the intended folder for every run this scans.
    cascade_path = run_folder / 'Results' / 'version_7.9' / 'cascade.csv'
    with cascade_path.open() as f:
        reader = DictReader(f)
        cascade_samples = {row['sample'] for row in reader}
    missing_samples = sample_names - cascade_samples
    if missing_samples:
        logger.error('Missing samples in run %s: %s',
                     run_folder.name,
                     sorted(missing_samples))
    return True
def test_no_index2(self):
    """
    Fills in 'X' for index2 and the tags when there is no index2 column.
    """
    stub_sample_sheet = """
[Header]
IEMFileVersion,3
Investigator Name,RL
Project Name,11-Jul-2014_nosamplenametest
Experiment Name,11-Jul-2014_nosamplenametest
Date,07/11/2014
Workflow,GenerateFASTQ
Assay,Nextera
Description,Nextera
Chemistry,Amplicon
[Reads]
251
251
[Settings]
[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501_Sample1_Proj1,Sample1_Proj1,10-Jul-2014_testing,N/A,ACGTACGT,\
10-Jul-2014_testing,Research:Sample1_Proj1:TRUE Comments:Sample1_Proj1:thisiscommentone \
Disablecontamcheck:Sample1_Proj1:FALSE,
CFE_SomeId_10-Jul-2014_N501_Sample2_Proj2,Sample2_Proj2,10-Jul-2014_testing,N/A,AAAAGGGG,\
10-Jul-2014_testing,Research:Sample2_Proj2:FALSE Comments:Sample2_Proj2:thisiscommenttwo \
Chemistry:Sample2_Proj2:BreakingBad Disablecontamcheck:Sample2_Proj2:TRUE,
"""
    ss = sample_sheet_parser(StringIO(stub_sample_sheet))
    sample = ss['Data']['Sample1-Proj1_S1']
    self.assertEqual('ACGTACGT', sample['index1'])
    # With no index2 column, the parser substitutes a placeholder 'X'.
    self.assertEqual('X', sample['index2'])
    self.assertEqual('N501-X', sample['tags'])
def test_extra_commas(self):
    """
    Parses a sample sheet padded with trailing commas on every line.
    """
    stub_sample_sheet = """
[Header],,,,,,,
IEMFileVersion,3,,,,,,,
Investigator Name,RL,,,,,,,
Project Name,10-Jul-2014,,,,,,,
Experiment Name,10-Jul-2014,,,,,,,
Date,07/10/2014,,,,,,,
Workflow,GenerateFASTQ,,,,,,,
Assay,Nextera,,,,,,,
Description,Nextera,,,,,,,
Chemistry,Amplicon,,,,,,,
[Reads],,,,,,,
251,,,,,,,
251,,,,,,,
[Settings],,,,,,,
[Data],,,,,,,
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501-N701_Sample1_Proj1,Sample1_Proj1,10-Jul-2014_testing,N/A,ACGTACGT,TGCATGCA,\
10-Jul-2014_testing,Research:Sample1_Proj1:TRUE Comments:Sample1_Proj1:thisiscommentone \
Disablecontamcheck:Sample1_Proj1:FALSE,
CFE_SomeId_10-Jul-2014_N501-N702_Sample2_Proj2,Sample2_Proj2,10-Jul-2014_testing,N/A,AAAAGGGG,CCCCTTTT,\
10-Jul-2014_testing,Research:Sample2_Proj2:FALSE Comments:Sample2_Proj2:thisiscommenttwo \
Chemistry:Sample2_Proj2:BreakingBad Disablecontamcheck:Sample2_Proj2:TRUE,
"""
    ss = sample_sheet_parser(StringIO(stub_sample_sheet))
    # The padding commas must not corrupt the header values.
    self.assertEqual(ss["Experiment Name"], "10-Jul-2014")
def find_groups(file_names, sample_sheet_path, included_projects=None):
    """ Group HCV samples with their MIDI partners.

    :param list[str] file_names: a list of FASTQ file names without paths
    :param sample_sheet_path: path to the SampleSheet.csv file
    :param included_projects: project codes to include, or None to include
        all
    """
    with open(sample_sheet_path) as sheet_file:
        run_info = sample_sheet_parser(sheet_file)
    overrides_path = (Path(sample_sheet_path).parent /
                      'SampleSheetOverrides.csv')
    if overrides_path.exists():
        with overrides_path.open() as overrides_file:
            read_sample_sheet_overrides(overrides_file, run_info)
    midi_hcv_code = 'MidHCV'
    # Split the DataSplit rows into MIDI samples and everything else.
    midi_files = {}
    wide_names = {}
    for row in run_info['DataSplit']:
        if row['project'] == midi_hcv_code:
            midi_files[row['sample']] = row['filename']
        elif included_projects is None or row['project'] in included_projects:
            wide_names[row['filename']] = (row['sample'], row['project'])
    # Map the first two underscore-separated fields back to the full name.
    trimmed_names = {}
    for file_name in file_names:
        prefix = '_'.join(file_name.split('_')[:2])
        trimmed_names[prefix] = file_name
    unused_names = set(trimmed_names.values())
    for prefix, file_name in sorted(trimmed_names.items()):
        entry = wide_names.get(prefix)
        if entry is None:
            # Project was not included.
            continue
        sample_name, project_code = entry
        midi_trimmed = midi_files.get(sample_name + 'MIDI')
        if midi_trimmed is None and sample_name.upper().endswith('WG'):
            # Retry without a 'WG' (whole genome) suffix.
            sample_name = sample_name[:-2]
            midi_trimmed = midi_files.get(sample_name + 'MIDI')
        midi_name = trimmed_names.get(midi_trimmed)
        unused_names.discard(file_name)
        unused_names.discard(midi_name)
        midi_project = midi_name and midi_hcv_code
        yield SampleGroup(sample_name,
                          (file_name, midi_name),
                          (project_code, midi_project))
    if unused_names:
        # Emit leftover MIDI files that never matched a main sample.
        sample_names = {file_name: sample_name
                        for sample_name, file_name in midi_files.items()}
        for prefix, file_name in sorted(trimmed_names.items()):
            if file_name not in unused_names:
                continue
            unused_names.discard(file_name)
            sample_name = sample_names.get(prefix)
            if sample_name is not None:
                yield SampleGroup(sample_name,
                                  (file_name, None),
                                  (midi_hcv_code, None))
def test_read_sample_sheet_overrides(tmpdir):
    """ An overrides file replaces the project code of a listed sample. """
    sheet_path = Path(str(tmpdir)) / 'SampleSheet.csv'
    override_path = sheet_path.parent / 'SampleSheetOverrides.csv'
    sheet_path.write_text("""\
[Header]
IEMFileVersion,3
Investigator Name,RL
Project Name,10-Jul-2014_v1test
Experiment Name,10-Jul-2014_v1test
Date,07/10/2014
Workflow,GenerateFASTQ
Assay,Nextera
Description,Nextera
Chemistry,Amplicon
[Reads]
251
251
[Settings]
[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501-N701_Sample1_Proj1,Sample1_Proj1,10-Jul-2014_testing,N/A,ACGTACGT,TGCATGCA,\
10-Jul-2014_testing,Research:Sample1_Proj1:TRUE Comments:Sample1_Proj1:thisiscommentone \
Disablecontamcheck:Sample1_Proj1:FALSE,
CFE_SomeId_10-Jul-2014_N501-N702_Sample2_Proj2,Sample2_Proj2,10-Jul-2014_testing,N/A,AAAAGGGG,CCCCTTTT,\
10-Jul-2014_testing,Research:Sample2_Proj2:FALSE Comments:Sample2_Proj2:thisiscommenttwo \
Chemistry:Sample2_Proj2:BreakingBad Disablecontamcheck:Sample2_Proj2:TRUE,
""")
    override_path.write_text("""\
sample,project
Sample2-Proj2_S2,AltB
""")
    with sheet_path.open() as f:
        run_info = sample_sheet_parser(f)
    with override_path.open() as f:
        read_sample_sheet_overrides(f, run_info)

    assert len(run_info['Data']) == 2
    split_rows = run_info['DataSplit']
    assert len(split_rows) == 2
    # Only the sample named in the overrides file changes project.
    assert split_rows[0]['project'] == 'Proj1'
    assert split_rows[1]['project'] == 'AltB'
def find(self, source_folder, qai_run_names=None):
    """ Find matching samples in the source folder.

    Puts all the sample names in self.fastq_paths.
    :param str source_folder: the folder to search for samples that match
        self.extract_num
    :param set qai_run_names: a set to add the new run name to as it would
        be formatted on QAI.
    """
    direct_path = os.path.join(source_folder, 'MiSeq', 'runs', self.run_name)
    if os.path.exists(direct_path):
        self.run_name = direct_path
    else:
        # Reformat a '<date>.<machine>' name into the folder naming scheme.
        name_parts = str(self.run_name).split('.')
        if len(name_parts) < 2:
            name_parts.append('M01841')
        date_format = '%d-%b-%y' if len(name_parts[0]) == 9 else '%d-%b-%Y'
        run_date = datetime.strptime(name_parts[0], date_format)
        base_name = run_date.strftime('%y%m%d') + '_' + name_parts[1]
        flag_pattern = os.path.join(source_folder,
                                    'MiSeq',
                                    'runs',
                                    base_name + '*',
                                    NEEDS_PROCESSING)
        flag_matches = glob(flag_pattern)
        if len(flag_matches) != 1:
            raise RuntimeError(
                'Expected one match for {}, but found: {}'.format(
                    flag_pattern, flag_matches))
        self.run_name = os.path.dirname(flag_matches[0])
    fastq_pattern = os.path.join(self.run_name,
                                 'Data',
                                 'Intensities',
                                 'BaseCalls',
                                 self.extract_num + '*_R1_*')
    fastq_matches = glob(fastq_pattern)
    if len(fastq_matches) == 0:
        raise RuntimeError('No matches found for ' + fastq_pattern)
    self.fastq_paths = sorted(fastq_matches)
    if qai_run_names is not None:
        sheet_path = os.path.join(self.run_name, 'SampleSheet.csv')
        with open(sheet_path) as f:
            try:
                sample_sheet = sample_sheet_parser(f)
            except ValueError:
                print(f'Bad sample sheet for {self.run_name}.')
            else:
                qai_run_names.add(sample_sheet['Project Name'])
def find_groups(file_names, sample_sheet_path, included_projects=None):
    """ Group HCV samples with their MIDI partners.

    :param list[str] file_names: a list of FASTQ file names without paths
    :param sample_sheet_path: path to the SampleSheet.csv file
    :param included_projects: project codes to include, or None to include
        all
    """
    with open(sample_sheet_path) as sheet_file:
        run_info = sample_sheet_parser(sheet_file)
    # Split the DataSplit rows into MIDI samples and everything else.
    midi_files = {}
    wide_names = {}
    for row in run_info['DataSplit']:
        if row['project'] == 'MidHCV':
            midi_files[row['sample']] = row['filename']
        elif included_projects is None or row['project'] in included_projects:
            wide_names[row['filename']] = row['sample']
    # Map the first two underscore-separated fields back to the full name.
    trimmed_names = {}
    for file_name in file_names:
        prefix = '_'.join(file_name.split('_')[:2])
        trimmed_names[prefix] = file_name
    unused_names = set(trimmed_names.values())
    for prefix, file_name in sorted(trimmed_names.items()):
        sample_name = wide_names.get(prefix)
        if sample_name is None:
            # Project was not included.
            continue
        midi_trimmed = midi_files.get(sample_name + 'MIDI')
        if midi_trimmed is None and sample_name.upper().endswith('WG'):
            # Retry without a 'WG' (whole genome) suffix.
            sample_name = sample_name[:-2]
            midi_trimmed = midi_files.get(sample_name + 'MIDI')
        midi_name = trimmed_names.get(midi_trimmed)
        unused_names.discard(file_name)
        unused_names.discard(midi_name)
        yield SampleGroup(sample_name, (file_name, midi_name))
    if unused_names:
        # Emit leftover MIDI files that never matched a main sample.
        sample_names = {file_name: sample_name
                        for sample_name, file_name in midi_files.items()}
        for prefix, file_name in sorted(trimmed_names.items()):
            if file_name not in unused_names:
                continue
            unused_names.discard(file_name)
            sample_name = sample_names.get(prefix)
            if sample_name is not None:
                yield SampleGroup(sample_name, (file_name, None))
def setUp(self):
    """ Parse the class's stub sample sheet before each test. """
    self.maxDiff = None  # show complete diffs when an assertion fails
    sheet = StringIO.StringIO(self.stub_sample_sheet)
    self.ss = sample_sheet_parser(sheet)
def setUp(self):
    """ Build the parsed sample sheet that the assertions inspect. """
    sheet_file = StringIO(self.stub_sample_sheet)
    self.ss = sample_sheet_parser(sheet_file)
def main():
    """ Run the MiSeq pipeline phases assigned to this MPI process.

    Each rank processes every len(fastq_files) % process_count-th sample;
    rank 0 additionally collates results in the summarizing phase.
    """
    comm = MPI.COMM_WORLD  # @UndefinedVariable
    process_rank = comm.Get_rank()
    process_count = comm.Get_size()
    args = parseOptions(comm)
    log_file = "{}/pipeline{}.log".format(args.run_folder, process_rank)
    logger = miseq_logging.init_logging(log_file,
                                        file_log_level=logging.DEBUG,
                                        console_log_level=logging.INFO)
    logger.info('Start processing run %s, rank %d',
                args.run_folder,
                process_rank)

    if args.mode is not None:
        run_info = None
    else:
        # 'rU' mode was removed in Python 3.11; plain 'r' already uses
        # universal newlines in Python 3.
        with open(args.run_folder+'/SampleSheet.csv', 'r') as sample_sheet:
            logger.debug("sample_sheet_parser({})".format(sample_sheet))
            run_info = sample_sheet_parser(sample_sheet)
            args.mode = run_info['Description']

    fastq_samples = []
    fastq_files = glob(args.run_folder + '/*_R1_001.fastq')
    for i, fastq in enumerate(fastq_files):
        if i % process_count != process_rank:
            # skip samples that are assigned to other worker processes
            continue
        sample_info = SampleInfo(fastq)

        # verify this sample is in SampleSheet.csv
        if run_info and sample_info.key not in run_info['Data']:
            logger.error(
                '{} not in SampleSheet.csv - cannot map this sample'.format(
                    sample_info.key))
            continue
        fastq_samples.append(sample_info)

    def launch_callback(command):
        logger.info("Launching {!r}".format(command))

    worker = Worker(launch_callback=launch_callback,
                    working_path=args.run_folder,
                    are_temp_folders_deleted=are_temp_folders_deleted,
                    logger=logger)

    if args.phase in ('filter', 'all'):
        filter_quality(args.run_folder, worker)

    if args.phase in ('mapping', 'all'):
        map_samples(args.run_folder, fastq_samples, worker)

    if args.phase in ('counting', 'all'):
        count_samples(fastq_samples, worker, args)

    if args.phase in ('summarizing', 'all') and process_rank == 0:
        collate_results(fastq_samples, worker, args, logger)

    # FIXME: this log message gets sent before workers start
    logger.info('Finish processing run %s, rank %d',
                args.run_folder,
                process_rank)