示例#1
0
def parse_sample_list(sample_sheet_file, run_data_directory, run_data_directory_file_list):
    """
    Creates a list of Sample objects, each with its sequence files attached

    The samples are read from the sample sheet. For every sample, the entries of
    ``run_data_directory_file_list`` are matched against the Illumina fastq naming
    pattern for that sample, and the matching files are stored on the sample as a
    ``model.SequenceFile``.

    :param sample_sheet_file: Sample Sheet file
    :param run_data_directory: Data directory including run directory (e.g. my_run/Data/Intensities/BaseCalls)
    :param run_data_directory_file_list: The list of all files in the data directory
        (each entry is joined onto ``run_data_directory`` to build the full path)
    :return: list of Sample objects
    :raises exceptions.SequenceFileError: if no file matches a sample, or if the
        matched files are rejected by ``_validate_pf_list``
    """
    sample_list = _parse_samples(sample_sheet_file)

    for sample in sample_list:
        properties_dict = _parse_out_sequence_file(sample)
        escaped_name = re.escape(sample.sample_name)

        # Patterns are tried in order; the first one that matches anything wins.
        candidate_patterns = (
            # this is the Illumina-defined pattern for naming fastq files, from:
            # http://blog.basespace.illumina.com/2014/08/18/fastq-upload-in-now-available-in-basespace/
            "{sample_name}_S{sample_number}_L\\d{{3}}_R(\\d+)_\\S+\\.fastq.*$".format(
                sample_name=escaped_name, sample_number=sample.sample_number),
            # deprecated behaviour, kept as a fallback: the sample number is
            # not checked, any number is accepted after "_S"
            "{sample_name}_S\\d+_L\\d{{3}}_R(\\d+)_\\S+\\.fastq.*$".format(
                sample_name=escaped_name),
        )

        pf_list = []
        for file_pattern in candidate_patterns:
            logging.info("Looking for files with pattern %s", file_pattern)
            regex = re.compile(file_pattern)
            # match() anchors at the start of the file name, so a sample called
            # "A1" does not also pick up the files of a sample called "BA1"
            pf_list = [file_name for file_name in run_data_directory_file_list
                       if regex.match(file_name)]
            if pf_list:
                break

        if not pf_list:
            # Neither pattern found anything. It's pretty likely, then, that
            # there aren't any fastq files in the directory that match what
            # the sample sheet says...
            raise exceptions.SequenceFileError(
                ("The uploader was unable to find any files with a file name that ends with "
                 ".fastq.gz for the sample in your sample sheet with name {} in the directory {}. "
                 "This usually happens when the Illumina MiSeq Reporter tool "
                 "does not generate any FastQ data.").format(
                    sample.sample_name, run_data_directory))

        # List of files may be invalid if directory searching in has been modified by user
        if not _validate_pf_list(pf_list):
            raise exceptions.SequenceFileError(
                ("The following file list {} found in the directory {} is invalid. "
                 "Please verify the folder containing the sequence files matches the SampleSheet file").format(
                    pf_list, run_data_directory))

        # Add the dir to each file to create the full path
        full_path_list = [path.join(run_data_directory, file_name) for file_name in pf_list]

        sq = model.SequenceFile(file_list=full_path_list, properties_dict=properties_dict)
        sample.sequence_file = deepcopy(sq)

    return sample_list
示例#2
0
def _parse_sample_list(sample_sheet_file):
    """
    Creates a list of all samples in the sample_sheet_file, with accompanying data/metadata

    Sequence files are looked up in ``<sample sheet dir>/Data/Intensities/BaseCalls/<sample_project>``,
    where ``sample_project`` is read from each sample. The matching files are stored
    on the sample as a ``model.SequenceFile``.

    :param sample_sheet_file: path to the sample sheet; its parent directory is used
        as the root of the run directory
    :return: list of samples
    :raises exceptions.SequenceFileError: if the BaseCalls directory cannot be listed,
        if a sample's project directory is missing, if no file matches a sample, or
        if the matched files are rejected by ``_validate_pf_list``
    """
    sample_list = _parse_samples(sample_sheet_file)
    sample_sheet_dir = path.dirname(sample_sheet_file)
    base_data_dir = path.join(sample_sheet_dir, "Data", "Intensities",
                              "BaseCalls")

    # walk() yields nothing for a missing/unreadable directory; without a default,
    # next() would surface that as a bare StopIteration
    base_listing = next(walk(base_data_dir), None)
    if base_listing is None:
        raise exceptions.SequenceFileError(
            "The uploader was unable to find the directory '{}'. "
            "Please verify your run directory structure".format(base_data_dir))
    # Get the list of project directories that contain sample files
    project_dir_list = base_listing[1]

    # File listing per project directory, so each directory is only read once
    # even when many samples share a project
    files_by_project_dir = {}

    for sample in sample_list:

        project_directory = sample.get('sample_project')
        if project_directory not in project_dir_list:
            # The project number in the sample sheet does not match a project folder in the run directory
            raise exceptions.SequenceFileError(
                "The uploader was unable to find the directory '{}' in '{}'. "
                "Please verify your SampleSheet Sample_Project matches the directory structure"
                .format(project_directory, base_data_dir))
        project_data_dir = path.join(base_data_dir, project_directory)
        if project_data_dir not in files_by_project_dir:
            files_by_project_dir[project_data_dir] = next(walk(project_data_dir))[2]
        data_dir_file_list = files_by_project_dir[project_data_dir]

        properties_dict = _parse_out_sequence_file(sample)
        file_pattern = "{sample_name}_S(\\S+)_R(\\d+)_(\\S*)\\.fastq.*$".format(
            sample_name=re.escape(sample.sample_name))
        logging.info("Looking for files with pattern %s", file_pattern)
        regex = re.compile(file_pattern)
        # match() anchors at the start of the file name, so a sample called
        # "A1" does not also pick up the files of a sample called "BA1"
        pf_list = [file_name for file_name in data_dir_file_list
                   if regex.match(file_name)]

        if not pf_list:
            # we didn't find anything
            raise exceptions.SequenceFileError((
                "The uploader was unable to find any files with a file name that ends with "
                ".fastq.gz for the sample in your sample sheet with name {} in the directory {}. "
            ).format(sample.sample_name, project_data_dir))

        # List of files may be invalid if directory searching in has been modified by user
        if not _validate_pf_list(pf_list):
            raise exceptions.SequenceFileError((
                "The following file list {} found in the directory {} is invalid. "
                "Please verify the folder containing the sequence files matches the SampleSheet file"
            ).format(pf_list, project_data_dir))

        # Add the dir to each file to create the full path
        full_path_list = [path.join(project_data_dir, file_name) for file_name in pf_list]

        sq = model.SequenceFile(file_list=full_path_list,
                                properties_dict=properties_dict)
        sample.sequence_file = deepcopy(sq)

    return sample_list