def test_no_sample_name(self):
        """
        Throws an exception if the Data portion has no Sample_Name column.
        """

        # Minimal sheet whose [Data] header deliberately lacks the required
        # Sample_Name column; the parser should reject it with ValueError.
        stub_sample_sheet = """
[Header]
IEMFileVersion,3
Investigator Name,RL
Project Name,11-Jul-2014_nosamplenametest
Experiment Name,11-Jul-2014_nosamplenametest
Date,07/11/2014
Workflow,GenerateFASTQ
Assay,Nextera
Description,Nextera
Chemistry,Amplicon
[Reads]
251
251
[Settings]
[Data]
There,Is,No,Sample,Name
A,B,C,D,E
"""

        with self.assertRaises(ValueError) as assertion:
            sample_sheet_parser(StringIO(stub_sample_sheet))
        # args[0] carries the parser's diagnostic message.
        self.assertEqual("sample sheet data header does not include Sample_Name",
                         assertion.exception.args[0])
    def test_no_sample_name(self):
        """
        Throws an exception if the Data portion has no Sample_Name column.
        """

        stub_sample_sheet = """
[Header]
IEMFileVersion,3
Investigator Name,RL
Project Name,11-Jul-2014_nosamplenametest
Experiment Name,11-Jul-2014_nosamplenametest
Date,07/11/2014
Workflow,GenerateFASTQ
Assay,Nextera
Description,Nextera
Chemistry,Amplicon
[Reads]
251
251
[Settings]
[Data]
There,Is,No,Sample,Name
A,B,C,D,E
"""

        with self.assertRaises(ValueError) as assertion:
            sample_sheet_parser(StringIO(stub_sample_sheet))
        self.assertEqual("sample sheet data header does not include Sample_Name",
                         assertion.exception.args[0])
Example #3
0
 def process_run(self, run_folder: Path):
     """ Tally the project combinations seen in one run folder.

     :param Path run_folder: MiSeq run folder to inspect.
     :return: False if the run doesn't need processing, True otherwise
         (including runs already marked as errors).
     :raises RuntimeError: if the sample sheet cannot be parsed.
     """
     if not (run_folder / 'needsprocessing').exists():
         return False
     if (run_folder / 'errorprocessing').exists():
         return True
     sample_sheet_path = run_folder / 'SampleSheet.csv'
     folder_name = run_folder.name
     with sample_sheet_path.open() as f:
         try:
             run_info = sample_sheet_parser(f)
         except Exception as exc:
             # Chain the original parse error so the traceback shows why
             # the sample sheet was rejected, instead of discarding it.
             raise RuntimeError(f'Failed to process run {folder_name}.') from exc
     # Group each sample's project codes by its sample number.
     project_groups = defaultdict(list)
     for sample_info in run_info['DataSplit']:
         sample_number = sample_info['sample_number']
         project = sample_info['project']
         project_groups[sample_number].append(project)
     for project_names in project_groups.values():
         # Sort so the same set of projects always yields the same key.
         project_names.sort()
         name_tuple = tuple(project_names)
         self.project_counts[name_tuple] += 1
         # Folder names sort chronologically, so max() keeps the latest run.
         self.latest_dates[name_tuple] = max(self.latest_dates[name_tuple],
                                             folder_name)
     top_projects = self.project_counts.most_common(3)
     summary = ', '.join(f'({", ".join(project_codes)}): {count}'
                         for project_codes, count in top_projects)
     logger.debug('After %s, top counts are: %s', folder_name, summary)
     return True
Example #4
0
def find_groups(file_names, sample_sheet_path, included_projects=None):
    """ Group HCV samples with their MIDI partners.

    :param list[str] file_names: a list of FASTQ file names without paths
    :param sample_sheet_path: path to the SampleSheet.csv file
    :param included_projects: project codes to include, or None to include
        all
    """
    with open(sample_sheet_path) as sample_sheet_file:
        run_info = sample_sheet_parser(sample_sheet_file)

    # Partition the sample sheet rows: MidHCV rows map sample -> filename,
    # all other (included) rows map filename -> sample.
    midi_files = {}
    wide_names = {}
    for row in run_info['DataSplit']:
        if row['project'] == 'MidHCV':
            midi_files[row['sample']] = row['filename']
        elif included_projects is None or row['project'] in included_projects:
            wide_names[row['filename']] = row['sample']

    # Index each FASTQ file by its first two underscore-separated fields.
    trimmed_names = {}
    for file_name in file_names:
        trimmed_names['_'.join(file_name.split('_')[:2])] = file_name

    for trimmed_name in sorted(trimmed_names):
        file_name = trimmed_names[trimmed_name]
        sample_name = wide_names.get(trimmed_name)
        if sample_name is None:
            continue  # Project was not included.
        midi_trimmed = midi_files.get(sample_name + 'MIDI')
        midi_name = trimmed_names.get(midi_trimmed)
        yield SampleGroup(sample_name, (file_name, midi_name))
Example #5
0
def process_folder(result_folder, qai_server, qai_user, qai_password,
                   pipeline_version):
    """ Upload one result folder's collated pipeline outputs to QAI.

    :param result_folder: version folder under the run's Results folder;
        its grandparent is the run folder holding SampleSheet.csv.
    :param qai_server: QAI server URL.
    :param qai_user: QAI login name.
    :param qai_password: QAI login password.
    :param pipeline_version: version string recorded with the upload.
    """
    logger.info('Uploading data to Oracle from {}'.format(result_folder))
    collated_conseqs = os.path.join(result_folder, 'conseq.csv')
    collated_counts = os.path.join(result_folder, 'remap_counts.csv')
    cascade = os.path.join(result_folder, 'cascade.csv')
    coverage_scores = os.path.join(result_folder, 'coverage_scores.csv')
    # Walk up two levels: result_folder -> Results folder -> run folder.
    all_results_path, _ = os.path.split(os.path.normpath(result_folder))
    run_path, _ = os.path.split(all_results_path)
    sample_sheet_file = os.path.join(run_path, "SampleSheet.csv")
    # NOTE: the old "rU" mode was removed in Python 3.11; text mode already
    # applies universal-newline translation.
    with open(sample_sheet_file) as f:
        sample_sheet = sample_sheet_parser.sample_sheet_parser(f)

    ok_sample_regions = load_ok_sample_regions(result_folder)

    with qai_helper.Session() as session:
        session.login(qai_server, qai_user, qai_password)

        run = find_run(session, sample_sheet["Experiment Name"])

        with open(collated_conseqs) as f:
            conseqs = build_conseqs(f, run, sample_sheet, ok_sample_regions)
        with open(coverage_scores) as f, \
                open(collated_counts) as f2, \
                open(cascade) as f3:
            upload_review_to_qai(f, f2, f3, run, sample_sheet, conseqs,
                                 session, pipeline_version)
Example #6
0
def process_run(run_folder: Path, skip_mid_hcv: bool):
    """ Check one run folder for samples missing from its cascade report.

    :param Path run_folder: MiSeq run folder to inspect.
    :param bool skip_mid_hcv: if True, ignore MidHCV samples.
    :return: False if the run doesn't need processing, True otherwise
        (including runs already marked as errors).
    :raises RuntimeError: if the sample sheet cannot be parsed.
    """
    if not (run_folder / 'needsprocessing').exists():
        return False
    if (run_folder / 'errorprocessing').exists():
        return True
    sample_sheet_path = run_folder / 'SampleSheet.csv'
    with sample_sheet_path.open() as f:
        try:
            run_info = sample_sheet_parser(f)
        except Exception as exc:
            # Chain the original parse error so the traceback explains
            # why the sample sheet was rejected.
            raise RuntimeError(
                f'Failed to process run {run_folder.name}.') from exc
    sample_names = set(run_info['Data'])
    if skip_mid_hcv:
        # Compile once instead of re-matching the pattern per sample.
        midi_pattern = re.compile(r'.*MidHCV_S\d+$')
        sample_names = {sample_name
                        for sample_name in sample_names
                        if not midi_pattern.match(sample_name)}
    # NOTE(review): results path is pinned to version_7.9 — confirm this
    # shouldn't track the current pipeline version.
    cascade_path = run_folder / 'Results' / 'version_7.9' / 'cascade.csv'
    with cascade_path.open() as f:
        reader = DictReader(f)
        cascade_samples = {row['sample'] for row in reader}
    missing_samples = sample_names - cascade_samples
    if missing_samples:
        logger.error('Missing samples in run %s: %s', run_folder.name,
                     sorted(missing_samples))
    return True
    def test_no_index2(self):
        """
        Parses a sample sheet whose [Data] section has no index2 column:
        index2 falls back to 'X' and the tags get an '-X' suffix.
        """

        stub_sample_sheet = """
[Header]
IEMFileVersion,3
Investigator Name,RL
Project Name,11-Jul-2014_nosamplenametest
Experiment Name,11-Jul-2014_nosamplenametest
Date,07/11/2014
Workflow,GenerateFASTQ
Assay,Nextera
Description,Nextera
Chemistry,Amplicon
[Reads]
251
251
[Settings]
[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501_Sample1_Proj1,Sample1_Proj1,10-Jul-2014_testing,N/A,ACGTACGT,\
10-Jul-2014_testing,Research:Sample1_Proj1:TRUE Comments:Sample1_Proj1:thisiscommentone \
Disablecontamcheck:Sample1_Proj1:FALSE,
CFE_SomeId_10-Jul-2014_N501_Sample2_Proj2,Sample2_Proj2,10-Jul-2014_testing,N/A,AAAAGGGG,\
10-Jul-2014_testing,Research:Sample2_Proj2:FALSE Comments:Sample2_Proj2:thisiscommenttwo \
Chemistry:Sample2_Proj2:BreakingBad Disablecontamcheck:Sample2_Proj2:TRUE,
"""

        ss = sample_sheet_parser(StringIO(stub_sample_sheet))
        sample = ss['Data']['Sample1-Proj1_S1']
        self.assertEqual('ACGTACGT', sample['index1'])
        # With no index2 column, the parser substitutes the placeholder 'X'.
        self.assertEqual('X', sample['index2'])
        self.assertEqual('N501-X', sample['tags'])
    def test_extra_commas(self):
        """
        Parses a sample sheet padded with extra trailing commas (as
        spreadsheet editors save them) without raising.
        """

        stub_sample_sheet = """
[Header],,,,,,,
IEMFileVersion,3,,,,,,,
Investigator Name,RL,,,,,,,
Project Name,10-Jul-2014,,,,,,,
Experiment Name,10-Jul-2014,,,,,,,
Date,07/10/2014,,,,,,,
Workflow,GenerateFASTQ,,,,,,,
Assay,Nextera,,,,,,,
Description,Nextera,,,,,,,
Chemistry,Amplicon,,,,,,,
[Reads],,,,,,,
251,,,,,,,
251,,,,,,,
[Settings],,,,,,,
[Data],,,,,,,
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501-N701_Sample1_Proj1,Sample1_Proj1,10-Jul-2014_testing,N/A,ACGTACGT,TGCATGCA,\
10-Jul-2014_testing,Research:Sample1_Proj1:TRUE Comments:Sample1_Proj1:thisiscommentone \
Disablecontamcheck:Sample1_Proj1:FALSE,
CFE_SomeId_10-Jul-2014_N501-N702_Sample2_Proj2,Sample2_Proj2,10-Jul-2014_testing,N/A,AAAAGGGG,CCCCTTTT,\
10-Jul-2014_testing,Research:Sample2_Proj2:FALSE Comments:Sample2_Proj2:thisiscommenttwo \
Chemistry:Sample2_Proj2:BreakingBad Disablecontamcheck:Sample2_Proj2:TRUE,
"""

        ss = sample_sheet_parser(StringIO(stub_sample_sheet))
        # Header values survive the trailing-comma padding.
        self.assertEqual(ss["Experiment Name"], "10-Jul-2014")
    def test_no_index2(self):
        """
        Parses a sample sheet whose [Data] section has no index2 column:
        index2 falls back to 'X' and the tags get an '-X' suffix.
        """

        stub_sample_sheet = """
[Header]
IEMFileVersion,3
Investigator Name,RL
Project Name,11-Jul-2014_nosamplenametest
Experiment Name,11-Jul-2014_nosamplenametest
Date,07/11/2014
Workflow,GenerateFASTQ
Assay,Nextera
Description,Nextera
Chemistry,Amplicon
[Reads]
251
251
[Settings]
[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501_Sample1_Proj1,Sample1_Proj1,10-Jul-2014_testing,N/A,ACGTACGT,\
10-Jul-2014_testing,Research:Sample1_Proj1:TRUE Comments:Sample1_Proj1:thisiscommentone \
Disablecontamcheck:Sample1_Proj1:FALSE,
CFE_SomeId_10-Jul-2014_N501_Sample2_Proj2,Sample2_Proj2,10-Jul-2014_testing,N/A,AAAAGGGG,\
10-Jul-2014_testing,Research:Sample2_Proj2:FALSE Comments:Sample2_Proj2:thisiscommenttwo \
Chemistry:Sample2_Proj2:BreakingBad Disablecontamcheck:Sample2_Proj2:TRUE,
"""

        ss = sample_sheet_parser(StringIO(stub_sample_sheet))
        sample = ss['Data']['Sample1-Proj1_S1']
        self.assertEqual('ACGTACGT', sample['index1'])
        # With no index2 column, the parser substitutes the placeholder 'X'.
        self.assertEqual('X', sample['index2'])
        self.assertEqual('N501-X', sample['tags'])
Example #10
0
    def test_extra_commas(self):
        """
        Parses a sample sheet padded with extra trailing commas (as
        spreadsheet editors save them) without raising.
        """

        stub_sample_sheet = """
[Header],,,,,,,
IEMFileVersion,3,,,,,,,
Investigator Name,RL,,,,,,,
Project Name,10-Jul-2014,,,,,,,
Experiment Name,10-Jul-2014,,,,,,,
Date,07/10/2014,,,,,,,
Workflow,GenerateFASTQ,,,,,,,
Assay,Nextera,,,,,,,
Description,Nextera,,,,,,,
Chemistry,Amplicon,,,,,,,
[Reads],,,,,,,
251,,,,,,,
251,,,,,,,
[Settings],,,,,,,
[Data],,,,,,,
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501-N701_Sample1_Proj1,Sample1_Proj1,10-Jul-2014_testing,N/A,ACGTACGT,TGCATGCA,\
10-Jul-2014_testing,Research:Sample1_Proj1:TRUE Comments:Sample1_Proj1:thisiscommentone \
Disablecontamcheck:Sample1_Proj1:FALSE,
CFE_SomeId_10-Jul-2014_N501-N702_Sample2_Proj2,Sample2_Proj2,10-Jul-2014_testing,N/A,AAAAGGGG,CCCCTTTT,\
10-Jul-2014_testing,Research:Sample2_Proj2:FALSE Comments:Sample2_Proj2:thisiscommenttwo \
Chemistry:Sample2_Proj2:BreakingBad Disablecontamcheck:Sample2_Proj2:TRUE,
"""

        ss = sample_sheet_parser(StringIO(stub_sample_sheet))
        # Header values survive the trailing-comma padding.
        self.assertEqual(ss["Experiment Name"], "10-Jul-2014")
Example #11
0
 def process_run(self, run_folder: Path):
     """ Tally the project combinations seen in one run folder.

     :param Path run_folder: MiSeq run folder to inspect.
     :return: False if the run doesn't need processing, True otherwise
         (including runs already marked as errors).
     :raises RuntimeError: if the sample sheet cannot be parsed.
     """
     if not (run_folder / 'needsprocessing').exists():
         return False
     if (run_folder / 'errorprocessing').exists():
         return True
     sample_sheet_path = run_folder / 'SampleSheet.csv'
     folder_name = run_folder.name
     with sample_sheet_path.open() as f:
         try:
             run_info = sample_sheet_parser(f)
         except Exception as exc:
             # Chain the original parse error so the traceback shows why
             # the sample sheet was rejected, instead of discarding it.
             raise RuntimeError(f'Failed to process run {folder_name}.') from exc
     # Group each sample's project codes by its sample number.
     project_groups = defaultdict(list)
     for sample_info in run_info['DataSplit']:
         sample_number = sample_info['sample_number']
         project = sample_info['project']
         project_groups[sample_number].append(project)
     for project_names in project_groups.values():
         # Sort so the same set of projects always yields the same key.
         project_names.sort()
         name_tuple = tuple(project_names)
         self.project_counts[name_tuple] += 1
         # Folder names sort chronologically, so max() keeps the latest run.
         self.latest_dates[name_tuple] = max(self.latest_dates[name_tuple], folder_name)
     top_projects = self.project_counts.most_common(3)
     summary = ', '.join(f'({", ".join(project_codes)}): {count}'
                         for project_codes, count in top_projects)
     logger.debug('After %s, top counts are: %s', folder_name, summary)
     return True
Example #12
0
def find_groups(file_names, sample_sheet_path, included_projects=None):
    """ Group HCV samples with their MIDI partners.

    :param list[str] file_names: a list of FASTQ file names without paths
    :param sample_sheet_path: path to the SampleSheet.csv file
    :param included_projects: project codes to include, or None to include
        all
    :return: yields SampleGroup(sample_name, (file_name, midi_name),
        (project_code, midi_project)) for each main sample, then one
        group per leftover MidHCV file.
    """
    with open(sample_sheet_path) as sample_sheet_file:
        run_info = sample_sheet_parser(sample_sheet_file)
    # Apply manual project overrides, if an overrides file sits beside
    # the sample sheet.
    overrides_path = Path(sample_sheet_path).parent / 'SampleSheetOverrides.csv'
    if overrides_path.exists():
        with overrides_path.open() as overrides_file:
            read_sample_sheet_overrides(overrides_file, run_info)

    midi_hcv_code = 'MidHCV'
    # MidHCV rows map sample name -> file name; all other (included)
    # rows map file name -> (sample name, project).
    midi_files = {row['sample']: row['filename']
                  for row in run_info['DataSplit']
                  if row['project'] == midi_hcv_code}
    wide_names = {row['filename']: (row['sample'], row['project'])
                  for row in run_info['DataSplit']
                  if (row['project'] != midi_hcv_code and
                      (included_projects is None or
                       row['project'] in included_projects))}
    # Key each FASTQ by its first two underscore-separated fields.
    trimmed_names = {'_'.join(file_name.split('_')[:2]): file_name
                     for file_name in file_names}
    # Track files not consumed as a main sample or MIDI partner.
    unused_names = set(trimmed_names.values())
    for trimmed_name, file_name in sorted(trimmed_names.items()):
        sample_entry = wide_names.get(trimmed_name)
        if sample_entry is None:
            # Project was not included.
            continue
        sample_name, project_code = sample_entry
        midi_trimmed = midi_files.get(sample_name + 'MIDI')
        if midi_trimmed is None and sample_name.upper().endswith('WG'):
            # Whole-genome samples may have a MIDI partner named without
            # the WG suffix; retry after stripping it.
            sample_name = sample_name[:-2]
            midi_trimmed = midi_files.get(sample_name + 'MIDI')
        midi_name = trimmed_names.get(midi_trimmed)
        unused_names.discard(file_name)
        unused_names.discard(midi_name)
        # midi_project is None when no MIDI file was found.
        midi_project = midi_name and midi_hcv_code
        yield SampleGroup(sample_name,
                          (file_name, midi_name),
                          (project_code, midi_project))

    if unused_names:
        # Yield orphaned MidHCV files as single-file groups.
        sample_names = {file_name: sample_name
                        for sample_name, file_name in midi_files.items()}
        for trimmed_name, file_name in sorted(trimmed_names.items()):
            if file_name in unused_names:
                unused_names.discard(file_name)
                sample_name = sample_names.get(trimmed_name)
                if sample_name is not None:
                    yield SampleGroup(sample_name,
                                      (file_name, None),
                                      (midi_hcv_code, None))
Example #13
0
def test_read_sample_sheet_overrides(tmpdir):
    """ An overrides file replaces the project code of a listed sample. """
    sample_sheet_path = Path(str(tmpdir)) / 'SampleSheet.csv'
    overrides_path = sample_sheet_path.parent / 'SampleSheetOverrides.csv'

    sample_sheet_path.write_text("""\
[Header]
IEMFileVersion,3
Investigator Name,RL
Project Name,10-Jul-2014_v1test
Experiment Name,10-Jul-2014_v1test
Date,07/10/2014
Workflow,GenerateFASTQ
Assay,Nextera
Description,Nextera
Chemistry,Amplicon
[Reads]
251
251
[Settings]
[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder
CFE_SomeId_10-Jul-2014_N501-N701_Sample1_Proj1,Sample1_Proj1,10-Jul-2014_testing,N/A,ACGTACGT,TGCATGCA,\
10-Jul-2014_testing,Research:Sample1_Proj1:TRUE Comments:Sample1_Proj1:thisiscommentone \
Disablecontamcheck:Sample1_Proj1:FALSE,
CFE_SomeId_10-Jul-2014_N501-N702_Sample2_Proj2,Sample2_Proj2,10-Jul-2014_testing,N/A,AAAAGGGG,CCCCTTTT,\
10-Jul-2014_testing,Research:Sample2_Proj2:FALSE Comments:Sample2_Proj2:thisiscommenttwo \
Chemistry:Sample2_Proj2:BreakingBad Disablecontamcheck:Sample2_Proj2:TRUE,
""")
    overrides_path.write_text("""\
sample,project
Sample2-Proj2_S2,AltB
""")

    with sample_sheet_path.open() as sheet_file:
        run_info = sample_sheet_parser(sheet_file)

    with overrides_path.open() as overrides_file:
        read_sample_sheet_overrides(overrides_file, run_info)

    assert len(run_info['Data']) == 2

    split_rows = run_info['DataSplit']
    assert len(split_rows) == 2
    # Only the overridden sample changes project.
    assert split_rows[0]['project'] == 'Proj1'
    assert split_rows[1]['project'] == 'AltB'
Example #14
0
    def find(self, source_folder, qai_run_names=None):
        """ Find matching samples in the source folder.

        Puts all the sample names in self.fastq_paths.
        :param str source_folder: the folder to search for samples that match
            self.extract_num
        :param set qai_run_names: a set to add the new run name to as it would
            be formatted on QAI.
        """
        # self.run_name is replaced in place: it starts as a display name
        # and ends as the run folder path.
        run_path = os.path.join(source_folder, 'MiSeq', 'runs', self.run_name)
        if os.path.exists(run_path):
            self.run_name = run_path
        else:
            run_parts = str(self.run_name).split('.')
            if len(run_parts) < 2:
                # NOTE(review): 'M01841' is presumably a default machine
                # id appended when none was given — confirm.
                run_parts.append('M01841')
            # 9-character dates use 2-digit years ('10-Jul-14'); longer
            # ones use 4-digit years ('10-Jul-2014').
            format_string = '%d-%b-%y' if len(
                run_parts[0]) == 9 else '%d-%b-%Y'
            run_date = datetime.strptime(run_parts[0], format_string)
            base_run_name = run_date.strftime('%y%m%d') + '_' + run_parts[1]
            # The run folder must exist and be flagged for processing.
            pattern = os.path.join(source_folder, 'MiSeq', 'runs',
                                   base_run_name + '*', NEEDS_PROCESSING)
            matches = glob(pattern)
            if len(matches) != 1:
                raise RuntimeError(
                    'Expected one match for {}, but found: {}'.format(
                        pattern, matches))
            self.run_name = os.path.dirname(matches[0])

        # Collect the R1 FASTQ files for this sample's extract number.
        pattern = os.path.join(self.run_name, 'Data', 'Intensities',
                               'BaseCalls', self.extract_num + '*_R1_*')
        matches = glob(pattern)
        if len(matches) == 0:
            raise RuntimeError('No matches found for ' + pattern)
        matches.sort()
        self.fastq_paths = matches
        if qai_run_names is not None:
            sample_sheet_path = os.path.join(self.run_name, 'SampleSheet.csv')
            with open(sample_sheet_path) as f:
                try:
                    sample_sheet = sample_sheet_parser(f)
                except ValueError:
                    # Best effort: report and carry on without a QAI name.
                    print(f'Bad sample sheet for {self.run_name}.')
                else:
                    qai_run_name = sample_sheet['Project Name']
                    qai_run_names.add(qai_run_name)
Example #15
0
def find_groups(file_names, sample_sheet_path, included_projects=None):
    """ Group HCV samples with their MIDI partners.

    :param list[str] file_names: a list of FASTQ file names without paths
    :param sample_sheet_path: path to the SampleSheet.csv file
    :param included_projects: project codes to include, or None to include
        all
    :return: yields SampleGroup(sample_name, (file_name, midi_name)) for
        each main sample, then one group per leftover MidHCV file.
    """
    with open(sample_sheet_path) as sample_sheet_file:
        run_info = sample_sheet_parser(sample_sheet_file)

    # MidHCV rows map sample name -> file name; all other (included)
    # rows map file name -> sample name.
    midi_files = {row['sample']: row['filename']
                  for row in run_info['DataSplit']
                  if row['project'] == 'MidHCV'}
    wide_names = {row['filename']: row['sample']
                  for row in run_info['DataSplit']
                  if (row['project'] != 'MidHCV' and
                      (included_projects is None or
                       row['project'] in included_projects))}
    # Key each FASTQ by its first two underscore-separated fields.
    trimmed_names = {'_'.join(file_name.split('_')[:2]): file_name
                     for file_name in file_names}
    # Track files not consumed as a main sample or MIDI partner.
    unused_names = set(trimmed_names.values())
    for trimmed_name, file_name in sorted(trimmed_names.items()):
        sample_name = wide_names.get(trimmed_name)
        if sample_name is None:
            # Project was not included.
            continue
        midi_trimmed = midi_files.get(sample_name + 'MIDI')
        if midi_trimmed is None and sample_name.upper().endswith('WG'):
            # Whole-genome samples may have a MIDI partner named without
            # the WG suffix; retry after stripping it.
            sample_name = sample_name[:-2]
            midi_trimmed = midi_files.get(sample_name + 'MIDI')
        midi_name = trimmed_names.get(midi_trimmed)
        unused_names.discard(file_name)
        unused_names.discard(midi_name)
        yield SampleGroup(sample_name, (file_name, midi_name))

    if unused_names:
        # Yield orphaned MidHCV files as single-file groups.
        sample_names = {file_name: sample_name
                        for sample_name, file_name in midi_files.items()}
        for trimmed_name, file_name in sorted(trimmed_names.items()):
            if file_name in unused_names:
                unused_names.discard(file_name)
                sample_name = sample_names.get(trimmed_name)
                if sample_name is not None:
                    yield SampleGroup(sample_name, (file_name, None))
Example #16
0
def process_folder(result_folder,
                   qai_server,
                   qai_user,
                   qai_password,
                   pipeline_version):
    """ Upload one result folder's collated pipeline outputs to QAI.

    :param result_folder: version folder under the run's Results folder;
        its grandparent is the run folder holding SampleSheet.csv.
    :param qai_server: QAI server URL.
    :param qai_user: QAI login name.
    :param qai_password: QAI login password.
    :param pipeline_version: version string recorded with the upload.
    """
    logger.info('Uploading data to Oracle from {}'.format(result_folder))
    collated_conseqs = os.path.join(result_folder, 'conseq.csv')
    collated_counts = os.path.join(result_folder, 'remap_counts.csv')
    cascade = os.path.join(result_folder, 'cascade.csv')
    coverage_scores = os.path.join(result_folder, 'coverage_scores.csv')
    # Walk up two levels: result_folder -> Results folder -> run folder.
    all_results_path, _ = os.path.split(os.path.normpath(result_folder))
    run_path, _ = os.path.split(all_results_path)
    sample_sheet_file = os.path.join(run_path, "SampleSheet.csv")
    # NOTE: the old "rU" mode was removed in Python 3.11; text mode already
    # applies universal-newline translation.
    with open(sample_sheet_file) as f:
        sample_sheet = sample_sheet_parser.sample_sheet_parser(f)

    ok_sample_regions = load_ok_sample_regions(result_folder)

    with qai_helper.Session() as session:
        session.login(qai_server,
                      qai_user,
                      qai_password)

        run = find_run(session, sample_sheet["Experiment Name"])

        with open(collated_conseqs) as f:
            conseqs = build_conseqs(f,
                                    run,
                                    sample_sheet,
                                    ok_sample_regions)
        with open(coverage_scores) as f, \
                open(collated_counts) as f2, \
                open(cascade) as f3:
            upload_review_to_qai(f,
                                 f2,
                                 f3,
                                 run,
                                 sample_sheet,
                                 conseqs,
                                 session,
                                 pipeline_version)
 def setUp(self):
     """Parse the class's stub sample sheet once before each test."""
     # Show full diffs on assertion failures.
     self.maxDiff = None
     # NOTE(review): StringIO.StringIO is the Python 2 module API; on
     # Python 3 this needs io.StringIO — confirm the target version.
     self.ss = sample_sheet_parser(StringIO.StringIO(self.stub_sample_sheet))
Example #18
0
 def setUp(self):
     """Parse the class's stub sample sheet once before each test."""
     self.ss = sample_sheet_parser(StringIO(self.stub_sample_sheet))
Example #19
0
def main():
    """ MPI entry point: map, count, and collate samples for one run.

    Each MPI rank processes the FASTQ files assigned to it round-robin;
    rank 0 also collates the combined results.
    """
    comm = MPI.COMM_WORLD  # @UndefinedVariable
    process_rank = comm.Get_rank()
    process_count = comm.Get_size()

    args = parseOptions(comm)
    # One log file per rank so concurrent workers don't interleave.
    log_file = "{}/pipeline{}.log".format(args.run_folder, process_rank)
    logger = miseq_logging.init_logging(log_file,
                                        file_log_level=logging.DEBUG,
                                        console_log_level=logging.INFO)
    logger.info('Start processing run %s, rank %d',
                args.run_folder,
                process_rank)

    if args.mode is not None:
        run_info = None
    else:
        # Derive the mode from the sample sheet when not given explicitly.
        # (The old "rU" mode was removed in Python 3.11; text mode already
        # applies universal-newline translation.)
        with open(args.run_folder + '/SampleSheet.csv') as sample_sheet:
            logger.debug("sample_sheet_parser({})".format(sample_sheet))
            run_info = sample_sheet_parser(sample_sheet)
            args.mode = run_info['Description']

    fastq_samples = []
    fastq_files = glob(args.run_folder + '/*_R1_001.fastq')
    for i, fastq in enumerate(fastq_files):
        if i % process_count != process_rank:
            # skip samples that are assigned to other worker processes
            continue

        sample_info = SampleInfo(fastq)

        # verify this sample is in SampleSheet.csv
        if run_info and sample_info.key not in run_info['Data']:
            logger.error(
                '{} not in SampleSheet.csv - cannot map this sample'.format(
                    sample_info.key))
            continue

        fastq_samples.append(sample_info)

    def launch_callback(command):
        logger.info("Launching {!r}".format(command))

    worker = Worker(launch_callback=launch_callback,
                    working_path=args.run_folder,
                    are_temp_folders_deleted=are_temp_folders_deleted,
                    logger=logger)

    if args.phase in ('filter', 'all'):
        filter_quality(args.run_folder, worker)

    if args.phase in ('mapping', 'all'):
        map_samples(args.run_folder, fastq_samples, worker)

    if args.phase in ('counting', 'all'):
        count_samples(fastq_samples, worker, args)

    if args.phase in ('summarizing', 'all') and process_rank == 0:
        collate_results(fastq_samples, worker, args, logger)

    # FIXME: this log message gets sent before workers start
    logger.info('Finish processing run %s, rank %d',
                args.run_folder,
                process_rank)
Example #20
0
 def setUp(self):
     """Parse the class's stub sample sheet once before each test."""
     self.ss = sample_sheet_parser(StringIO(self.stub_sample_sheet))
Example #21
0
 def setUp(self):
     """Parse the class's stub sample sheet once before each test."""
     # Show full diffs on assertion failures.
     self.maxDiff = None
     # NOTE(review): StringIO.StringIO is the Python 2 module API; on
     # Python 3 this needs io.StringIO — confirm the target version.
     self.ss = sample_sheet_parser(StringIO.StringIO(self.stub_sample_sheet))