Пример #1
0
    def process(self):
        """
        Configure the base mask from the sample sheet, run the parent step
        and record the sample id of every produced fastq file.

        Reads:
            self.input_dir    -- run folder (or a sequence of them; only the
                                 first one is used)
            self.sample_sheet -- sample sheet path, or a list holding exactly
                                 one path
            self.output_dir, self.meta['pipeline']['project_name']

        Writes:
            self.input_dir, self.sample_sheet, self.use_base_mask,
            self.output_files and self.meta['job']['sample_id'].

        Raises:
            Exception: if `self.sample_sheet` is a list that does not contain
                       exactly one sample sheet file
        """

        # Reduce inputs to only first element. Strings are excluded
        # explicitly: on Python 3 they expose __iter__ as well, and a plain
        # path would otherwise be truncated to its first character.
        if (hasattr(self.input_dir, '__iter__')
                and not isinstance(self.input_dir, (str, bytes))):
            self.input_dir = self.input_dir[0]

        self.input_dir = os.path.join(self.input_dir, "Data/Intensities/BaseCalls/")

        if isinstance(self.sample_sheet, list):
            if len(self.sample_sheet) > 1:
                raise Exception('Too many sample sheet files: %s' % ','.join(self.sample_sheet))
            if not self.sample_sheet:
                # Fail with a clear message instead of an opaque IndexError
                raise Exception('No sample sheet file provided')
            self.sample_sheet = self.sample_sheet[0]

        ss = SampleSheet(self.sample_sheet)
        mask_length, double_idx = ss.get_mask_length()

        # One index read segment per index: two for double-indexed runs
        if double_idx:
            base_mask = "y*,I{0},I{0},Y*".format(mask_length)
        else:
            base_mask = "y*,I{0},Y*".format(mask_length)

        self.use_base_mask = str(base_mask)
        super(CasavaDemux, self).process()

        prj_dir = os.path.join(self.output_dir, 'Project_' + self.meta['pipeline']['project_name'])
        self.output_files = utils.find(prj_dir, "*.fastq.gz")

        # Set the metadata: the first sample id whose "<id>_" prefix matches
        # the file name wins. Files matching no sample id add no entry, so
        # the list may be shorter than self.output_files.
        self.meta['job']['sample_id'] = []
        sample_ids = ss.get_sample_ids()
        for output_file in self.output_files:
            file_name = os.path.basename(output_file)
            for sample_id in sample_ids:
                if file_name.startswith("%s_" % sample_id):
                    self.meta['job']['sample_id'].append(sample_id)
                    break
Пример #2
0
    def process(self):
        """
        Parse the run folder name, publish the run metadata and validate the
        sample sheet.

        The last component of `self.input_dir` must look like
        <DATE:6 digits>_<HISEQ_SN:6 chars>_<RUN_COUNT:4 digits>_<A|B><FC_ID>;
        its parent directory name is stored as the `hiseq` metadata entry.

        Writes:
            self.input_dir    -- first element only, with a trailing '/'
            self.meta         -- the 'pipeline' entry is replaced
            self.output_files -- one-element list holding the value returned
                                 by SampleSheet.validate()

        Raises:
            Exception: if the run folder name does not match the expected
                       pattern
        """

        # Reduce inputs to only first element. Strings are excluded
        # explicitly: on Python 3 they expose __iter__ as well, and a plain
        # path would otherwise be truncated to its first character.
        if (hasattr(self.input_dir, '__iter__')
                and not isinstance(self.input_dir, (str, bytes))):
            self.input_dir = self.input_dir[0]

        # The trailing slash makes dirname() return the run folder itself
        if not self.input_dir.endswith('/'):
            self.input_dir += '/'

        (parent_dir, flowcell_dir) = os.path.split(os.path.dirname(self.input_dir))

        parsed = re.search(r'''(?P<DATE>\d{6})_
                               (?P<HISEQ_SN>\w{6})_
                               (?P<RUN_COUNT>\d{4})_
                               (?P<FC_POS>[AB])
                               (?P<FC_ID>.*$)''', flowcell_dir, re.X)
        if parsed is None:
            # Without this guard the failure surfaces later as an
            # AttributeError on `parsed.group`
            raise Exception('Cannot parse flowcell directory name: %s' % flowcell_dir)

        hiseq = os.path.basename(parent_dir)
        fc_id = parsed.group('FC_ID')
        fc_pos = parsed.group('FC_POS')

        ss = SampleSheet(os.path.join(self.input_dir, 'SampleSheet.csv'))
        validated_path = os.path.join(self.output_dir, 'sample_sheet_validated.csv')
        project_name = ss.get_project_name() or 'DefaultProject'

        run_desc = 'Flowcell %s on %s/%s' % (fc_id, hiseq, fc_pos)

        self.meta.update({
                'pipeline': {
                    'date'         : parsed.group('DATE'),
                    'descr'        : run_desc,
                    'fc_id'        : fc_id,
                    'fc_pos'       : fc_pos,
                    'hiseq'        : hiseq,
                    'hiseq_sn'     : parsed.group('HISEQ_SN'),
                    'project_name' : project_name,
                    'run_count'    : int(parsed.group('RUN_COUNT')),
                    'nfiles'       : ss.get_lines_count()
                }
            })

        ss_validated = ss.validate(project_name, validated_path)

        self.output_files = [ss_validated]
Пример #3
0
    def create(sample_sheet, input_dir, output_dir=None, output_file_name=None):
        """
        Create a file of file names (fofn) and return the path to it.

        For every sample id of the sample sheet, `input_dir` is walked
        recursively. In each directory the files whose name contains
        "<sample_id>_" are split into R1 and R2 files with `Fofn.r1_regex`
        and `Fofn.r2_regex`. One record is written per R1 file, paired with
        the first R2 file that contains group 1 of the R1 regex match (or
        with an empty R2 when none does). Directories holding only R2 files
        get one record per R2 file with an empty R1.

        Args:
             sample_sheet:
                full path to the sample sheet

             input_dir:
                path to the directory containing the input files

             output_dir:
                directory where the fofn is written. If not specified, the
                directory of the sample sheet is used

             output_file_name:
                name of the output fofn. If not specified, the sample sheet
                file name without its extension plus "_fofn.csv" is used

        Returns:
             the path of the written fofn

        Raises:
             Exception: if `sample_sheet` or `input_dir` does not exist
        """

        if not os.path.exists(sample_sheet):
            raise Exception("input error: parameter `sample_sheet` %s does not exist" % sample_sheet)

        if not os.path.exists(input_dir):
            # Report the offending directory, not the sample sheet
            raise Exception("input error: parameter `input_dir` %s does not exist" % input_dir)

        print("*********************************")
        print("sample_sheet: %s" % os.path.abspath(sample_sheet))
        print("input_dir: %s" % os.path.abspath(input_dir))
        print("*********************************")

        #set default name of the output fofn
        if not output_file_name:
            output_file_name = os.path.basename(sample_sheet).rsplit(".", 1)[0] + "_fofn.csv"

        if not output_dir:
            output_dir = os.path.dirname(sample_sheet)

        output_file = os.path.join(output_dir, output_file_name)

        with open(output_file, 'w') as f_fofn:
            ss = SampleSheet(sample_sheet)
            for sample_id in ss.get_sample_ids():
                sample_tag = '%s_' % sample_id
                print("*********************************")
                # Parenthesised single-argument form prints the same text on
                # Python 2 and is valid syntax on Python 3
                print("sample_id : %s" % sample_id)
                for root, dirs, file_list in os.walk(input_dir):
                    #group the files by sample id and read number
                    sample_files = [name for name in file_list if sample_tag in name]
                    r1_files = [
                        os.path.join(root, name) for name in sample_files
                        if Fofn.r1_regex.search(name)
                    ]
                    r2_files = [
                        os.path.join(root, name) for name in sample_files
                        if Fofn.r2_regex.search(name)
                    ]

                    if not r1_files:
                        # Only R2 files in this directory: R1 is left empty
                        for r2_file in r2_files:
                            Fofn._write_record(f_fofn, '', r2_file, sample_id)
                        continue

                    for r1_file in r1_files:
                        r2_match = ''
                        if r2_files:
                            #pick the first R2 file that matches the R1 file base
                            r1_base = Fofn.r1_regex.search(r1_file).group(1)
                            r2_match = next(
                                (r2_file for r2_file in r2_files if r1_base in r2_file),
                                '')
                            if not r2_match:
                                print("No R2 found for sample Id %s" % sample_id)
                        Fofn._write_record(f_fofn, r1_file, r2_match, sample_id)

        return output_file
Пример #4
0
    def create(sample_sheet,
               input_dir,
               output_dir=None,
               output_file_name=None):
        """
        Create a file of file names (fofn) and return the path to it.

        For every sample id of the sample sheet, `input_dir` is walked
        recursively. In each directory the files whose name contains
        "<sample_id>_" are split into R1 and R2 files with `Fofn.r1_regex`
        and `Fofn.r2_regex`. One record is written per R1 file, paired with
        the first R2 file that contains group 1 of the R1 regex match (or
        with an empty R2 when none does). Directories holding only R2 files
        get one record per R2 file with an empty R1.

        Args:
             sample_sheet:
                full path to the sample sheet

             input_dir:
                path to the directory containing the input files

             output_dir:
                directory where the fofn is written. If not specified, the
                directory of the sample sheet is used

             output_file_name:
                name of the output fofn. If not specified, the sample sheet
                file name without its extension plus "_fofn.csv" is used

        Returns:
             the path of the written fofn

        Raises:
             Exception: if `sample_sheet` or `input_dir` does not exist
        """

        if not os.path.exists(sample_sheet):
            raise Exception(
                "input error: parameter `sample_sheet` %s does not exist" %
                sample_sheet)

        if not os.path.exists(input_dir):
            # Report the offending directory, not the sample sheet
            raise Exception(
                "input error: parameter `input_dir` %s does not exist" %
                input_dir)

        print("*********************************")
        print("sample_sheet: %s" % os.path.abspath(sample_sheet))
        print("input_dir: %s" % os.path.abspath(input_dir))
        print("*********************************")

        #set default name of the output fofn
        if not output_file_name:
            output_file_name = os.path.basename(sample_sheet).rsplit(
                ".", 1)[0] + "_fofn.csv"

        if not output_dir:
            output_dir = os.path.dirname(sample_sheet)

        output_file = os.path.join(output_dir, output_file_name)

        with open(output_file, 'w') as f_fofn:
            ss = SampleSheet(sample_sheet)
            for sample_id in ss.get_sample_ids():
                sample_tag = '%s_' % sample_id
                print("*********************************")
                # Parenthesised single-argument form prints the same text on
                # Python 2 and is valid syntax on Python 3
                print("sample_id : %s" % sample_id)
                for root, dirs, file_list in os.walk(input_dir):
                    #group the files by sample id and read number
                    sample_files = [
                        name for name in file_list if sample_tag in name
                    ]
                    r1_files = [
                        os.path.join(root, name)
                        for name in sample_files
                        if Fofn.r1_regex.search(name)
                    ]
                    r2_files = [
                        os.path.join(root, name)
                        for name in sample_files
                        if Fofn.r2_regex.search(name)
                    ]

                    if not r1_files:
                        # Only R2 files in this directory: R1 is left empty
                        for r2_file in r2_files:
                            Fofn._write_record(f_fofn, '', r2_file,
                                               sample_id)
                        continue

                    for r1_file in r1_files:
                        r2_match = ''
                        if r2_files:
                            #pick the first R2 file that matches the R1 file base
                            r1_base = Fofn.r1_regex.search(r1_file).group(1)
                            r2_match = next(
                                (r2_file for r2_file in r2_files
                                 if r1_base in r2_file), '')
                            if not r2_match:
                                print("No R2 found for sample Id %s" %
                                      sample_id)
                        Fofn._write_record(f_fofn, r1_file, r2_match,
                                           sample_id)

        return output_file