def process(self):
    """Run CASAVA demultiplexing for one flowcell.

    Normalizes ``self.input_dir`` / ``self.sample_sheet`` to single paths,
    derives the base mask from the sample sheet, delegates the actual run to
    the parent class, then collects the produced fastq.gz files and records
    the sample ids they belong to in ``self.meta['job']['sample_id']``.

    Raises:
        Exception: if more than one sample sheet file was supplied.
    """
    # Reduce inputs to only first element (input may arrive as a list).
    if hasattr(self.input_dir, '__iter__'):
        self.input_dir = self.input_dir[0]
    self.input_dir = os.path.join(self.input_dir, "Data/Intensities/BaseCalls/")

    # isinstance instead of type() == list; a single sheet is required.
    if isinstance(self.sample_sheet, list):
        if len(self.sample_sheet) > 1:
            raise Exception('Too many sample sheet files: %s'
                            % ','.join(self.sample_sheet))
        self.sample_sheet = self.sample_sheet[0]

    ss = SampleSheet(self.sample_sheet)
    mask_length, double_idx = ss.get_mask_length()
    # Dual-index runs need a second I<len> component in the base mask.
    if double_idx:
        self.use_base_mask = "y*,I{0},I{0},Y*".format(mask_length)
    else:
        self.use_base_mask = "y*,I{0},Y*".format(mask_length)

    super(CasavaDemux, self).process()

    prj_dir = os.path.join(self.output_dir,
                           'Project_' + self.meta['pipeline']['project_name'])
    self.output_files = utils.find(prj_dir, "*.fastq.gz")

    # Set the metadata: map each output file back to its sample id by the
    # "<sample_id>_" filename prefix convention.
    self.meta['job']['sample_id'] = []
    sample_ids = ss.get_sample_ids()
    for output_file in self.output_files:
        for sample_id in sample_ids:
            if os.path.basename(output_file).startswith("%s_" % sample_id):
                self.meta['job']['sample_id'].append(sample_id)
                break
def process(self):
    """Parse and validate a flowcell run directory.

    Extracts run metadata (date, sequencer serial, run count, flowcell
    position/id) from the flowcell directory name, validates the
    SampleSheet.csv found in the input directory, stores the metadata in
    ``self.meta['pipeline']`` and sets the validated sheet as the output.

    Raises:
        Exception: if the flowcell directory name does not match the
            expected ``<DATE>_<SN>_<COUNT>_<POS><ID>`` Illumina pattern.
    """
    # Reduce inputs to only first element (input may arrive as a list).
    if hasattr(self.input_dir, '__iter__'):
        self.input_dir = self.input_dir[0]
    # A trailing slash makes os.path.dirname() below return the flowcell
    # directory itself rather than its parent.
    if not self.input_dir.endswith('/'):
        self.input_dir += '/'
    (parent_dir, flowcell_dir) = os.path.split(os.path.dirname(self.input_dir))

    parsed = re.search(r'''(?P<DATE>\d{6})_
                           (?P<HISEQ_SN>\w{6})_
                           (?P<RUN_COUNT>\d{4})_
                           (?P<FC_POS>[AB])
                           (?P<FC_ID>.*$)''', flowcell_dir, re.X)
    # Fail with a clear message instead of an AttributeError on None when
    # the directory name is not a standard Illumina run folder name.
    if parsed is None:
        raise Exception("input error: directory name %s does not look like "
                        "an Illumina run folder" % flowcell_dir)

    ss = SampleSheet(os.path.join(self.input_dir, 'SampleSheet.csv'))
    ss_validated = os.path.join(self.output_dir, 'sample_sheet_validated.csv')
    project_name = ss.get_project_name() or 'DefaultProject'
    run_desc = 'Flowcell %s on %s/%s' % (parsed.group('FC_ID'),
                                         os.path.basename(parent_dir),
                                         parsed.group('FC_POS'))
    self.meta.update({
        'pipeline': {
            'date': parsed.group('DATE'),
            'descr': run_desc,
            'fc_id': parsed.group('FC_ID'),
            'fc_pos': parsed.group('FC_POS'),
            'hiseq': os.path.basename(parent_dir),
            'hiseq_sn': parsed.group('HISEQ_SN'),
            'project_name': project_name,
            'run_count': int(parsed.group('RUN_COUNT')),
            'nfiles': ss.get_lines_count()
        }
    })
    ss_validated = ss.validate(project_name, ss_validated)
    self.output_files = [ss_validated]
def create(sample_sheet, input_dir, output_dir=None, output_file_name=None):
    """
    Create a file of file names (fofn) and return the path to it.

    Walks ``input_dir``, pairs R1/R2 fastq files per sample id found in the
    sample sheet, and writes one record per pair (or per unpaired file).

    Args:
        sample_sheet: full path to the sample sheet
        input_dir: path to the directory containing the input files
        output_dir: directory of the output fofn; defaults to the
            sample sheet's directory
        output_file_name: name of the output fofn; defaults to the sample
            sheet base name with a "_fofn.csv" suffix

    Returns:
        The full path of the written fofn file.

    Raises:
        Exception: if ``sample_sheet`` or ``input_dir`` does not exist.
    """
    if not os.path.exists(sample_sheet):
        raise Exception("input error: parameter `sample_sheet` %s does not exist"
                        % sample_sheet)
    if not os.path.exists(input_dir):
        # Bug fix: the message previously interpolated sample_sheet here.
        raise Exception("input error: parameter `input_dir` %s does not exist"
                        % input_dir)

    print("*********************************")
    print("sample_sheet: %s" % os.path.abspath(sample_sheet))
    print("input_dir: %s" % os.path.abspath(input_dir))
    print("*********************************")

    # Set default name and location of the output fofn.
    if not output_file_name:
        output_file_name = os.path.basename(sample_sheet).rsplit(".", 1)[0] + "_fofn.csv"
    if not output_dir:
        output_dir = os.path.dirname(sample_sheet)
    output_file = os.path.join(output_dir, output_file_name)

    with open(output_file, 'w') as f_fofn:
        ss = SampleSheet(sample_sheet)
        for sample_id in ss.get_sample_ids():
            print("*********************************")
            # Was a Py2 print statement; use the call form like the rest
            # of the function.
            print("sample_id : %s" % sample_id)
            for root, dirs, file_list in os.walk(input_dir):
                # Group the files by sample id and read number.
                r1_files = [os.path.join(root, file_name)
                            for file_name in file_list
                            if ('%s_' % sample_id in file_name
                                and Fofn.r1_regex.search(file_name))]
                r2_files = [os.path.join(root, file_name)
                            for file_name in file_list
                            if ('%s_' % sample_id in file_name
                                and Fofn.r2_regex.search(file_name))]
                r1_file = ""
                if r1_files:
                    for r1_file in r1_files:
                        # Filter the R2 files that match the R1 file base;
                        # hoist the invariant group lookup out of the
                        # comprehension condition.
                        r1_base = Fofn.r1_regex.search(r1_file).group(1)
                        r2_matchs = [r2_file for r2_file in r2_files
                                     if r1_base in r2_file]
                        if r2_matchs:
                            Fofn._write_record(f_fofn, r1_file,
                                               r2_matchs[0], sample_id)
                        else:
                            if r2_files:
                                print("No R2 found for sample Id %s" % sample_id)
                            # Record the unpaired R1 on its own.
                            Fofn._write_record(f_fofn, r1_file, '', sample_id)
                elif r2_files:
                    # No R1 at all: record each R2 with an empty R1 slot.
                    for r2_file in r2_files:
                        Fofn._write_record(f_fofn, r1_file, r2_file, sample_id)
    return output_file
def create(sample_sheet, input_dir, output_dir=None, output_file_name=None):
    """
    Create a file of file names (fofn) and return the path to it.

    Walks ``input_dir``, pairs R1/R2 fastq files per sample id found in the
    sample sheet, and writes one record per pair (or per unpaired file).

    Args:
        sample_sheet: full path to the sample sheet
        input_dir: path to the directory containing the input files
        output_dir: directory of the output fofn; defaults to the
            sample sheet's directory
        output_file_name: name of the output fofn; defaults to the sample
            sheet base name with a "_fofn.csv" suffix

    Returns:
        The full path of the written fofn file.

    Raises:
        Exception: if ``sample_sheet`` or ``input_dir`` does not exist.
    """
    if not os.path.exists(sample_sheet):
        raise Exception(
            "input error: parameter `sample_sheet` %s does not exist"
            % sample_sheet)
    if not os.path.exists(input_dir):
        # Bug fix: the message previously interpolated sample_sheet here.
        raise Exception(
            "input error: parameter `input_dir` %s does not exist"
            % input_dir)

    print("*********************************")
    print("sample_sheet: %s" % os.path.abspath(sample_sheet))
    print("input_dir: %s" % os.path.abspath(input_dir))
    print("*********************************")

    # Set default name and location of the output fofn.
    if not output_file_name:
        output_file_name = os.path.basename(sample_sheet).rsplit(
            ".", 1)[0] + "_fofn.csv"
    if not output_dir:
        output_dir = os.path.dirname(sample_sheet)
    output_file = os.path.join(output_dir, output_file_name)

    with open(output_file, 'w') as f_fofn:
        ss = SampleSheet(sample_sheet)
        for sample_id in ss.get_sample_ids():
            print("*********************************")
            # Was a Py2 print statement; use the call form like the rest
            # of the function.
            print("sample_id : %s" % sample_id)
            for root, dirs, file_list in os.walk(input_dir):
                # Group the files by sample id and read number.
                r1_files = [
                    os.path.join(root, file_name) for file_name in file_list
                    if ('%s_' % sample_id in file_name
                        and Fofn.r1_regex.search(file_name))
                ]
                r2_files = [
                    os.path.join(root, file_name) for file_name in file_list
                    if ('%s_' % sample_id in file_name
                        and Fofn.r2_regex.search(file_name))
                ]
                r1_file = ""
                if r1_files:
                    for r1_file in r1_files:
                        # Filter the R2 files that match the R1 file base;
                        # hoist the invariant group lookup out of the
                        # comprehension condition.
                        r1_base = Fofn.r1_regex.search(r1_file).group(1)
                        r2_matchs = [
                            r2_file for r2_file in r2_files
                            if r1_base in r2_file
                        ]
                        if r2_matchs:
                            Fofn._write_record(f_fofn, r1_file, r2_matchs[0],
                                               sample_id)
                        else:
                            if r2_files:
                                print("No R2 found for sample Id %s"
                                      % sample_id)
                            # Record the unpaired R1 on its own.
                            Fofn._write_record(f_fofn, r1_file, '', sample_id)
                elif r2_files:
                    # No R1 at all: record each R2 with an empty R1 slot.
                    for r2_file in r2_files:
                        Fofn._write_record(f_fofn, r1_file, r2_file,
                                           sample_id)
    return output_file