def parse_directory(self):
    """Traverse a CASAVA 1.8+ generated directory structure and return a dictionary.

    Not implemented yet: the method raises ``NotImplementedError`` as its
    first statement, so the draft traversal that follows never runs.

    :raises NotImplementedError: always
    """
    raise NotImplementedError("This method is not yet implemented")

    # ------------------------------------------------------------------
    # Unreachable draft, kept as the starting point for the real
    # implementation.
    # NOTE(review): fc_dir, fc_name and fc_date are not assigned anywhere
    # in this method -- they have to be defined before the raise above is
    # dropped.
    # ------------------------------------------------------------------

    # Create a Flowcell object
    flowcell = Flowcell()
    flowcell.filename = self.samplesheet_file
    flowcell.fc_id()

    seq_dir = self.get_sequence_dir()
    stats_dir = self.get_basecall_stats()

    project_glob = os.path.join(seq_dir, "Project_*")
    project_entries = []
    for project_path in glob.glob(project_glob):
        sample_glob = os.path.join(project_path, "Sample_*")
        sample_entries = []
        for sample_path in glob.glob(sample_glob):
            fastq_glob = os.path.join(sample_path, "*.fastq.gz")
            sheet_glob = os.path.join(sample_path, "*.csv")
            fastq_names = [
                os.path.basename(path) for path in glob.glob(fastq_glob)
            ]
            sheets = glob.glob(sheet_glob)
            assert len(sheets) == 1, "ERROR: Could not unambiguously locate samplesheet in %s" % sample_path
            sample_entries.append({
                'sample_dir': os.path.relpath(sample_path, project_path),
                # The glob pattern minus its trailing "*" is the prefix
                # that precedes the sample name in the matched path.
                'sample_name': sample_path.replace(sample_glob[0:-1], ''),
                'files': fastq_names,
                'samplesheet': os.path.basename(sheets[0]),
            })
        project_entries.append({
            'project_dir': os.path.relpath(project_path, seq_dir),
            # Same prefix-stripping trick as for the sample name.
            'project_name': project_path.replace(project_glob[0:-1], ''),
            'samples': sample_entries,
        })

    return {
        'fc_dir': fc_dir,
        'fc_name': fc_name,
        'fc_date': fc_date,
        'data_dir': os.path.relpath(seq_dir, fc_dir),
        'basecall_stats_dir': stats_dir,
        'projects': project_entries,
    }
def parse_directory(self):
    """Traverse a CASAVA 1.8+ generated directory structure and return a dictionary.

    Placeholder only: ``NotImplementedError`` is raised before any work is
    done. The code after the ``raise`` is an unreachable draft of the
    intended traversal (projects -> samples -> fastq files / samplesheet).

    :raises NotImplementedError: always
    """
    raise NotImplementedError("This method is not yet implemented")

    # NOTE(review): unreachable draft from here on. It reads fc_dir, fc_name
    # and fc_date, none of which is assigned in this method; resolve that
    # before removing the raise above.

    def describe_sample(sample_dir, project_dir, sample_dir_pattern):
        # Build the dictionary entry for a single "Sample_*" directory.
        fastq_file_pattern = os.path.join(sample_dir, "*.fastq.gz")
        samplesheet_pattern = os.path.join(sample_dir, "*.csv")
        fastq_files = [
            os.path.basename(fastq) for fastq in glob.glob(fastq_file_pattern)
        ]
        samplesheet = glob.glob(samplesheet_pattern)
        assert len(samplesheet) == 1, "ERROR: Could not unambiguously locate samplesheet in %s" % sample_dir
        # Pattern without its trailing "*" is the prefix before the name.
        sample_name = sample_dir.replace(sample_dir_pattern[0:-1], '')
        return {
            'sample_dir': os.path.relpath(sample_dir, project_dir),
            'sample_name': sample_name,
            'files': fastq_files,
            'samplesheet': os.path.basename(samplesheet[0]),
        }

    def describe_project(project_dir, data_dir, project_dir_pattern):
        # Build the dictionary entry for a single "Project_*" directory.
        sample_dir_pattern = os.path.join(project_dir, "Sample_*")
        project_samples = []
        for sample_dir in glob.glob(sample_dir_pattern):
            project_samples.append(
                describe_sample(sample_dir, project_dir, sample_dir_pattern))
        project_name = project_dir.replace(project_dir_pattern[0:-1], '')
        return {
            'project_dir': os.path.relpath(project_dir, data_dir),
            'project_name': project_name,
            'samples': project_samples,
        }

    # Create a Flowcell object
    fc = Flowcell()
    fc.filename = self.samplesheet_file
    fc.fc_id()

    unaligned_dir = self.get_sequence_dir()
    basecall_stats_dir = self.get_basecall_stats()

    project_dir_pattern = os.path.join(unaligned_dir, "Project_*")
    projects = []
    for project_dir in glob.glob(project_dir_pattern):
        projects.append(
            describe_project(project_dir, unaligned_dir, project_dir_pattern))

    result = {}
    result['fc_dir'] = fc_dir
    result['fc_name'] = fc_name
    result['fc_date'] = fc_date
    result['data_dir'] = os.path.relpath(unaligned_dir, fc_dir)
    result['basecall_stats_dir'] = basecall_stats_dir
    result['projects'] = projects
    return result
def _from_pre_casava_structure(self):
    """Build the requested project's flowcell from a pre-casava layout.

    Needs both the ``project`` and ``flowcell`` arguments. Run information
    is loaded from the flowcell directory below the "archive" and
    "production" roots of the configuration; files are then collected from
    the flowcell directory below the "production" root.

    :returns: the result of subsetting the flowcell on ``sample_prj`` with
        the requested project, or ``None`` when the argument check fails or
        the loaded flowcell is falsy
    """
    if not self._check_pargs(["project", "flowcell"]):
        return

    flowcell = Flowcell()
    roots = [
        self.config.get("archive", "root"),
        self.config.get("production", "root"),
    ]
    flowcell.load([os.path.join(root, self.pargs.flowcell) for root in roots])
    production_dir = os.path.join(
        self.config.get("production", "root"), self.pargs.flowcell)

    if flowcell:
        project_fc = flowcell.subset("sample_prj", self.pargs.project)
        project_fc.collect_files(production_dir)
        return project_fc

    self.log.warn(
        "No run information available for {}".format(self.pargs.flowcell))
    return None
def _from_casava_structure(self):
    """Get information from casava structure.

    Walks the directory formed by joining ``self._meta.root_path`` and
    ``self._meta.path_id`` with ``filtered_walk``, keeping only entries
    whose name ends in ``-bcbb-config.yaml``. For every entry found, a
    ``Flowcell`` is created from it, subset on ``sample_prj`` with the
    requested project, and asked to collect files from the directory that
    holds the config file.

    :returns: list with one subsetted flowcell per config file found, or
        ``None`` when the ``project`` argument check fails
    """
    if not self._check_pargs(["project"]):
        return

    config_suffix = "-bcbb-config.yaml"

    def bcbb_yaml_filter(f):
        # Plain suffix comparison instead of a regex so that the "." in the
        # file name is matched literally rather than as "any character".
        return f.endswith(config_suffix)

    search_root = os.path.join(self._meta.root_path, self._meta.path_id)
    fc_list = []
    for config_file in filtered_walk(search_root, bcbb_yaml_filter):
        fc = Flowcell(config_file)
        fc_new = fc.subset("sample_prj", self.pargs.project)
        fc_new.collect_files(os.path.dirname(config_file))
        fc_list.append(fc_new)
    return fc_list
def samplesheet_csv_to_yaml(fn):
    """Convert SampleSheet.csv to bcbb-config.yaml file.

    Loads the samplesheet into a ``Flowcell``, attaches matching fastq
    files and a running barcode id to its samples, sets the flowcell date
    (today, ``yymmdd``) and name, and writes ``fc.as_yaml()`` to
    ``<name>-bcbb-config.yaml`` in the same directory as the input, where
    ``<name>`` is the "name" entry of the last sample iterated.

    :param fn: input file
    """
    fc = Flowcell(infile=fn)
    # Barcode ids are handed out sequentially, starting at 1.
    bc_id = 1
    for s in fc.samples:
        sequence = fc.get_entry(s, "sequence")
        name = fc.get_entry(s, "name")
        # Currently only look for casava-based pattern
        pat = os.path.join(os.path.dirname(fn), "{}_{}*fastq*".format(name, sequence))
        seqfiles = glob.glob(pat)
        if seqfiles:
            # glob returns matches in arbitrary order; sort for a stable list
            seqfiles.sort()
            fc.set_entry(s, "files", seqfiles)
        # NOTE(review): barcode ids are assumed to be assigned to every
        # sample, whether or not fastq files were found -- confirm.
        fc.set_entry(s, "barcode_id", bc_id)
        if bc_id == 1:
            # The flowcell id is taken from the first sample only: second
            # "-"-separated field of its "flowcell_id" entry.
            flowcell_id = fc.get_entry(s, "flowcell_id").split("-")[1]
        bc_id = bc_id + 1
    fc.fc_date = datetime.datetime.now().strftime("%y%m%d")
    # NOTE(review): flowcell_id and name are only bound inside the loop, so
    # an empty fc.samples raises a NameError from here on.
    fc.fc_name = flowcell_id
    # name still holds the value from the last loop iteration
    outfile = os.path.join(os.path.dirname(fn), "{}-bcbb-config.yaml".format(name))
    with open(outfile, "w") as fh:
        fh.write(fc.as_yaml())