def demultiplex_fastq(outdir, samplesheet, fastq1, fastq2=None):
    """Demultiplex a bcl-converted illumina fastq file. Assumes it has the
    index sequence in the header a la CASAVA 1.8+
    """
    outfiles = {}
    counts = {}
    sdata = HiSeqRun.parse_samplesheet(samplesheet)
    reads = [1]
    if fastq2 is not None:
        reads.append(2)

    # For each Lane-Index combination, create a file and open a filehandle
    for sd in sdata:
        lane = sd['Lane']
        index = sd['Index']
        if lane not in outfiles:
            outfiles[lane] = {}
            counts[lane] = {}
        outfiles[lane][index] = []
        counts[lane][index] = 0
        for read in reads:
            fname = "tmp_{}_{}_L00{}_R{}_001.fastq.gz".format(
                sd['SampleID'], index, lane, read)
            outfiles[lane][index].append(
                FastQWriter(os.path.join(outdir, fname)))

    # Parse the input file(s) and write the records to the appropriate output files
    fhs = [FastQParser(fastq1)]
    if fastq2 is not None:
        fhs.append(FastQParser(fastq2))
    for r, fh in enumerate(fhs):
        for record in fh:
            header = parse_header(record[0])
            lane = str(header['lane'])
            index = header['index']
            if lane in outfiles and index in outfiles[lane]:
                outfiles[lane][index][r].write(record)
                counts[lane][index] += 1

    # Close filehandles and replace the handles with the file names
    for lane in list(outfiles.keys()):
        for index in list(outfiles[lane].keys()):
            # If no sequences were written, remove the temporary files and
            # the entry from the results
            if counts[lane][index] == 0:
                for fh in outfiles[lane][index]:
                    fh.close()
                    os.unlink(fh.name())
                del outfiles[lane][index]
                continue
            for r, fh in enumerate(outfiles[lane][index]):
                fh.close()
                fname = fh.name()
                # Rename the temporary file to a persistent name
                nname = fname.replace("tmp_", "")
                os.rename(fname, nname)
                outfiles[lane][index][r] = nname
    return outfiles
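# demultiplex_fastq routes each record by the lane and index that parse_header
# extracts from a CASAVA 1.8+ style read name. The helper below is a simplified
# stand-in written for illustration only (it is not the project's parse_header,
# which may return additional fields); it shows the two header fields the
# demultiplexing loop relies on.
def _parse_casava_header(name):
    # Example name: "@EAS139:136:FC706VJ:2:2104:15343:197393 1:N:0:ATCACG"
    # Left part:  instrument:run:flowcell:lane:tile:x:y
    # Right part: read:is_filtered:control:index
    left, right = name.lstrip("@").split(" ")
    instrument, run, fcid, lane, tile, x, y = left.split(":")
    read, is_filtered, control, index = right.split(":")
    return {'lane': int(lane), 'index': index}

print(_parse_casava_header("@EAS139:136:FC706VJ:2:2104:15343:197393 1:N:0:ATCACG"))
# {'lane': 2, 'index': 'ATCACG'}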
def test_get_project_names(self):
    """Get the projects from a samplesheet
    """
    # Assert that an empty file returns an empty list
    fh, ssheet = tempfile.mkstemp(dir=self.rootdir, suffix=".csv")
    os.close(fh)
    self.assertListEqual([], HiSeqRun.get_project_names(ssheet),
                         "The list of projects for an empty file is not empty")

    # Generate artificial samplesheet data
    data = td.generate_samplesheet_data()
    projects = {}
    for d in data:
        projects[d[-1]] = 1

    # Write the data to a samplesheet
    td._write_samplesheet(data, ssheet)

    # Assert that the list of projects returned is the same that we generated
    self.assertListEqual(sorted(projects.keys()),
                         sorted(HiSeqRun.get_project_names(ssheet)),
                         "The list of projects does not match the original list")
def test_get_project_sample_ids(self):
    """Test that getting the project samples from a samplesheet behaves as expected
    """
    # Generate artificial samplesheet data
    data = td.generate_samplesheet_data()
    fh, ssheet = tempfile.mkstemp(dir=self.rootdir, suffix=".csv")
    os.close(fh)
    td._write_samplesheet(data, ssheet)

    # Assert that getting samples for a non-existing project returns an empty list
    self.assertListEqual([], HiSeqRun.get_project_sample_ids(ssheet, td.generate_project()),
                         "Getting samples for a non-existing project returned unexpected output")

    # Iterate over the projects and assert that the returned samples are correct
    samples = {}
    for row in data:
        if row[9] not in samples:
            samples[row[9]] = []
        samples[row[9]].append(row[2])

    for proj, sample in samples.items():
        self.assertListEqual(sorted(sample),
                             sorted(HiSeqRun.get_project_sample_ids(ssheet, proj)),
                             "The returned list of samples did not match the original")
def test_parse_samplesheet(self):
    """Write and parse a csv-file
    """
    # Assert non-existing file raises exception
    with self.assertRaises(IOError):
        HiSeqRun.parse_samplesheet(os.path.join(self.rootdir, 'non-existing-samplesheet'))

    # Write a csv file with some bogus values
    sdata = td.generate_samplesheet_data()
    samplesheet = os.path.join(self.rootdir, 'SampleSheet.csv')
    HiSeqRun.write_samplesheet(sdata, samplesheet)

    # Assert that the written data corresponds to the generated data
    with open(samplesheet) as fh:
        # Assert that the header is correct
        self.assertListEqual(HiSeqRun._samplesheet_header(),
                             next(fh).strip().split(","),
                             "Written header does not match expected header")
        for entry in sdata:
            # Assert that all rows have the correct values in the correct columns
            self.assertListEqual([str(e) for e in entry],
                                 next(fh).strip().split(","),
                                 "Written data row does not match entry in generated samplesheet")

        # Assert that all rows from the samplesheet have been consumed
        with self.assertRaises(StopIteration):
            next(fh)

    # Assert that the parsed data matches the generated data
    data = HiSeqRun.parse_samplesheet(samplesheet)
    self.assertEqual(len(sdata), len(data),
                     "Number of parsed entries does not match number of generated entries")
    for d in data:
        self.assertListEqual([str(e) for e in sdata.pop(0)],
                             [d[col] for col in HiSeqRun._samplesheet_header()],
                             "Parsed data row does not match entry in generated samplesheet")

    # Assert that filtering on lane returns expected output
    lanes = list(set([d["Lane"] for d in data]))
    obs_lane_data = HiSeqRun.parse_samplesheet(samplesheet, lane=lanes[-1])
    exp_lane_data = [d for d in data if str(d["Lane"]) == str(lanes[-1])]
    self.assertListEqual(sorted(obs_lane_data),
                         sorted(exp_lane_data),
                         "Lane-filtered parsed data does not match the expected entries")
def get_expected(csv_file, lane):
    """Extract the expected barcodes in a lane from a supplied csv samplesheet
    """
    rows = HiSeqRun.parse_samplesheet(csv_file, lane=lane)
    return [r["Index"] for r in rows]
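# Hypothetical usage of get_expected; the samplesheet path and lane number are
# illustrative, and the printed barcodes depend entirely on the samplesheet.
expected_barcodes = get_expected("/path/to/SampleSheet.csv", lane=1)
print(expected_barcodes)  # e.g. ['ATCACG', 'CGATGT', ...]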
def status_query(archive_dir, analysis_dir, flowcell, project, brief):
    """Get a status report of the progress of flowcells based on a snapshot
    of the file system
    """
    last_step = 14
    status = []
    # Process each flowcell in the archive directory
    for fcdir in IlluminaRun.get_flowcell(archive_dir, flowcell):
        fc_status = {}
        fc_status['flowcell'] = os.path.basename(fcdir)

        # Locate the samplesheet
        samplesheet = IlluminaRun.get_samplesheet(fcdir)
        if samplesheet is None:
            print("\t***ERROR***: Could not locate samplesheet in flowcell directory. Skipping..")
            continue
        fc_status['samplesheet'] = samplesheet

        # Get a list of the projects in the samplesheet
        projects = HiSeqRun.get_project_names(samplesheet)
        if len(projects) == 0:
            print("\t***WARNING***: No projects matched your filter [{}] for flowcell. Skipping..".format(project))
            continue
        fc_status['projects'] = []

        # Iterate over the projects in the flowcell
        for proj in projects:
            proj = proj.replace("__", ".")
            proj_status = {}
            proj_status['project'] = proj

            pdir = bcbio.get_project_analysis_dir(analysis_dir, proj)
            if not pdir:
                continue
            proj_status['project_dir'] = pdir
            proj_status['samples'] = []
            proj_status['no_finished_samples'] = 0

            samples = HiSeqRun.get_project_sample_ids(samplesheet, proj)
            for smpl in samples:
                smpl = smpl.replace("__", ".")
                sample_status = {}
                proj_status['samples'].append(sample_status)
                sample_status['sample_id'] = smpl

                sdir = bcbio.get_sample_analysis_dir(pdir, smpl)
                if not sdir:
                    continue
                sample_status['sample_dir'] = sdir

                # Match the flowcell we're processing to the sample flowcell directories
                sample_fc = [d for d in IlluminaRun.get_flowcell(sdir)
                             if d.split("_")[-1] == fcdir.split("_")[-1]]
                if len(sample_fc) == 0:
                    continue
                sample_fc = sample_fc[0]
                sample_status['sample_fc_dir'] = sample_fc

                fastq_screen = bcbio.get_fastq_screen_folder(sample_fc)
                if fastq_screen:
                    sample_status['fastq_screen'] = [fastq_screen,
                                                     bcbio.fastq_screen_finished(fastq_screen)]

                now = datetime.datetime.now()
                pipeline_start_indicator = bcbio.get_pipeline_indicator(sample_fc, [1])
                if len(pipeline_start_indicator) == 0:
                    continue
                pipeline_start_indicator = pipeline_start_indicator[0]

                most_recent, _ = bcbio.get_most_recent_indicator([pipeline_start_indicator])
                sample_status['pipeline_started'] = [pipeline_start_indicator, most_recent]

                most_recent, ifile = bcbio.get_most_recent_indicator(
                    bcbio.get_pipeline_indicator(sample_fc))
                sample_status['pipeline_progress'] = [ifile, most_recent]

                sample_log = bcbio.get_sample_pipeline_log(sample_fc, smpl)
                if not sample_log:
                    continue
                st = os.stat(sample_log)
                sample_status['pipeline_log'] = [sample_log,
                                                 datetime.datetime.fromtimestamp(st.st_mtime)]

                jobids = slurm.get_slurm_jobid(smpl)
                sample_status['slurm_job'] = []
                for jobid in jobids:
                    sample_status['slurm_job'].append([jobid, slurm.get_slurm_jobstatus(jobid)])

                # A sample is considered finished when the last pipeline step has an
                # indicator file and fastq_screen has completed
                most_recent, ifile = bcbio.get_most_recent_indicator(
                    bcbio.get_pipeline_indicator(sample_fc, [last_step]))
                if ifile is not None and sample_status.get('fastq_screen', [None, False])[1]:
                    sample_status['finished'] = True
                    proj_status['no_finished_samples'] += 1

            if proj_status['no_finished_samples'] == len(samples):
                proj_status['finished'] = True
            fc_status['projects'].append(proj_status)

        status.append(fc_status)
    print_status(status, brief)
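# For orientation, a sketch of the nested structure that status_query hands to
# print_status. The keys mirror the assignments above; every concrete value is
# made up for illustration, and the timestamp/indicator entries are elided.
example_status = [{
    'flowcell': '120101_SN001_0001_AFCID',      # hypothetical flowcell directory name
    'samplesheet': '/archive/120101_SN001_0001_AFCID/SampleSheet.csv',
    'projects': [{
        'project': 'J.Doe_12_01',                # hypothetical project name
        'project_dir': '/analysis/J.Doe_12_01',
        'no_finished_samples': 1,
        'finished': True,                        # set when all samples are finished
        'samples': [{
            'sample_id': 'P001_101',
            'sample_dir': '/analysis/J.Doe_12_01/P001_101',
            'sample_fc_dir': '/analysis/J.Doe_12_01/P001_101/120101_AFCID',
            'fastq_screen': ['<fastq_screen dir>', True],    # [path, finished?]
            'pipeline_started': ['<indicator file>', None],  # [path, timestamp]
            'pipeline_progress': ['<indicator file>', None], # [path, timestamp]
            'pipeline_log': ['<log file>', None],            # [path, mtime]
            'slurm_job': [[123456, 'RUNNING']],              # [jobid, state]
            'finished': True,
        }],
    }],
}]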
def setUp(self):
    self.rootdir = tempfile.mkdtemp(prefix="test_illumina_hiseq_")
    self.hiseq = HiSeqRun(self.rootdir)