Example #1
    def test_get_csv_reader_no_sheet(self):
        """
        Make sure an error is raised if the csv reader is not given a valid sample sheet
        :return:
        """
        sheet_file = path.join(path_to_module, "fake_ngs_data", "Alignment_1")

        with self.assertRaises(SampleSheetError):
            common.get_csv_reader(sheet_file)
Example #2
    def test_get_csv_reader_no_sheet(self):
        """
        When no sheet is given to parser, throw error
        :return:
        """
        sheet_file = os.path.join(path_to_module, "fake_dir_data")

        with self.assertRaises(SampleSheetError):
            common.get_csv_reader(sheet_file)
Example #3
def parse_metadata(sample_sheet_file):
    """
    Parse the [Reads] section of the .csv file; [Header] and
        [BCLConvert_Settings] lines are read only to track the current section
    Read1Cycles/Read2Cycles values are collected in a list under the key
        "readLengths", and Index1Cycles/Index2Cycles under "indexCycles"

    arguments:
            sample_sheet_file -- path to UploadList.csv

    returns a dictionary containing the parsed key:value pairs from the .csv file
    """

    metadata_dict = {"readLengths": [], "indexCycles": []}

    csv_reader = common.get_csv_reader(sample_sheet_file)

    section = None

    for line in csv_reader:
        if "[Header]" in line or "[BCLConvert_Settings]" in line:
            section = "header"
            continue
        elif "[Reads]" in line:
            section = "reads"
            continue
        elif "[BCLConvert_Data]" in line:
            break
        elif line and line[0].startswith("["):
            section = "unknown"
            continue

        if not line or not line[0]:
            continue

        if not section:
            logging.debug("Sample sheet is missing important sections: no sections were found")
            raise exceptions.SampleSheetError("Sample sheet is missing important sections: no sections were found.",
                                              sample_sheet_file)
        elif section == "reads":
            if line[0] == "Read1Cycles" or line[0] == "Read2Cycles":
                metadata_dict["readLengths"].append(line[1])
            elif line[0] == "Index1Cycles" or line[0] == "Index2Cycles":
                metadata_dict["indexCycles"].append(line[1])

    # currently sends just the larger readLengths
    if len(metadata_dict["readLengths"]) == 2:
        metadata_dict["layoutType"] = "PAIRED_END"
    elif len(metadata_dict["readLengths"]) == 1:
        metadata_dict["layoutType"] = "SINGLE_END"
    else:
        logging.debug("The sample sheet has invalid [Reads] sections: [Reads] section should have 2 or 4 entries.")
        raise exceptions.SampleSheetError("The sample sheet has invalid [Reads] sections: "
                                          "[Reads] section should have 2 or 4 entries.",
                                          sample_sheet_file)
    metadata_dict["readLengths"] = max(metadata_dict["readLengths"])
    metadata_dict["indexCycles"] = max(metadata_dict["indexCycles"])

    return metadata_dict
Example #4
    def test_get_csv_reader_valid_sheet(self):
        """
        Given a valid sample sheet, ensure the parsed sheet matches expected output
        :return:
        """
        sheet_file = os.path.join(path_to_module, "test_csv_reader.csv")

        lines = common.get_csv_reader(sheet_file)
        # This is a sample of what the MiSeq sample sheet looks like, but it also makes a good
        # example of what we want our csv reader to be able to parse.
        correct_lines = [
            ['[Header]'],
            ['IEMFileVersion', '4'],
            ['Investigator Name', 'Some Guy'],
            ['Experiment Name', '1'],
            ['Date', '10/15/2013'],
            ['Workflow', 'GenerateFASTQ'],
            ['Application', 'FASTQ Only'],
            ['Assay', 'Nextera XT'],
            ['Description', 'Superbug'],
            ['Chemistry', 'Amplicon'],
            [],
            ['[Reads]'],
            ['251'],
            ['250'],
            [],
            ['[Settings]'],
            ['ReverseComplement', '0'],
            ['Adapter', 'AAAAGGGGAAAAGGGGAAA'],
            [],
            ['[Data]'],
            ['Sample_ID', 'Sample_Name', 'Sample_Plate', 'Sample_Well', 'I7_Index_ID', 'index', 'I5_Index_ID', 'index2',
             'Sample_Project', 'Description'],
            ['01-1111', '01-1111', '1', '01', 'N01', 'AAAAAAAA', 'S01', 'TTTTTTTT', '6', 'Super bug '],
            ['02-2222', '02-2222', '2', '02', 'N02', 'GGGGGGGG', 'S02', 'CCCCCCCC', '6', 'Scary bug '],
            ['03-3333', '03-3333', '3', '03', 'N03', 'CCCCCCCC', 'S03', 'GGGGGGGG', '6', 'Deadly bug ']
        ]

        for line, c_line in zip(lines, correct_lines):
            self.assertEqual(line, c_line)
Example #5
    def test_get_csv_reader_valid_sheet(self):
        """
        Given a valid sample sheet, ensure the parsed sheet matches expected output
        :return:
        """
        sheet_file = path.join(path_to_module, "fake_ngs_data",
                               "SampleSheet.csv")

        lines = common.get_csv_reader(sheet_file)

        correct_lines = [
            ['[Header]'],
            ['Local Run Manager Analysis Id', '4004'],
            ['Experiment Name', '1'],
            ['Date', '10/15/2013'],
            ['Workflow', 'GenerateFastQWorkflow'],
            ['Description', 'Superbug'],
            ['Chemistry', 'Amplicon'],
            [],
            ['[Reads]'],
            ['151'],
            ['151'],
            [],
            ['[Settings]'],
            ['Adapter', 'AAAAGGGGAAAAGGGGAAA'],
            [],
            ['[Data]'],
            ['Sample_ID', 'Sample_Name', 'index', 'I7_Index_ID', 'index2', 'I5_Index_ID', 'Sample_Project'],
            ['01-1111-4004', '01-1111', 'AAAAAAAA', 'N01', 'TTTTTTTT', 'S01', '6'],
            ['02-2222-4004', '02-2222', 'GGGGGGGG', 'N02', 'CCCCCCCC', 'S02', '6'],
            ['03-3333-4004', '03-3333', 'CCCCCCCC', 'N03', 'GGGGGGGG', 'S03', '6']
        ]

        for line, c_line in zip(lines, correct_lines):
            self.assertEqual(line, c_line)
Example #6
def _parse_samples(sample_sheet_file):
    """
    Parse all the lines under "[BCLConvert_Data]" in the .csv file
    Keys in sample_key_translation_dict have their values changed for
        uploading to the REST API
    All other keys keep the same name that they have in the .csv file

    arguments:
            sample_sheet_file -- path to UploadList.csv

    returns a list containing Sample objects created from the parsed
        key:value pairs in the .csv file
    """

    logging.info("Reading data from sample sheet {}".format(sample_sheet_file))

    csv_reader = common.get_csv_reader(sample_sheet_file)
    # start with an ordered dictionary so that keys are ordered in the same
    # way that they are inserted.
    sample_dict = OrderedDict()
    sample_list = []

    sample_key_translation_dict = {
        'Sample_ID': 'sampleName',
        'Sample_Project': 'sample_project'
    }

    # keep the translation dict available as an attribute on the function itself
    _parse_samples.sample_key_translation_dict = sample_key_translation_dict

    # initialize dictionary keys from first line (data headers/attributes)
    set_attributes = False
    for line in csv_reader:

        if set_attributes:
            for item in line:

                if item in sample_key_translation_dict:
                    key_name = sample_key_translation_dict[item]
                else:
                    key_name = item

                sample_dict[key_name] = ""

            break

        if "[BCLConvert_Data]" in line:
            set_attributes = True

    # fill in values for keys. line is currently below the [BCLConvert_Data] headers
    for sample_number, line in enumerate(csv_reader):
        # if the line is empty (like a blank line at the end of the file) continue
        if not line:
            continue

        if len(sample_dict.keys()) != len(line):
            """
            if there is one more Data header compared to the length of
            data values then add an empty string to the end of data values
            i.e. the Description will be an empty string
            assumes the last Data header is going to be the Description
            this handles the case where the last trailing comma is trimmed

            Shaun said this issue may come up when a user edits the
            SampleSheet from within the MiSeq software
            """
            if len(sample_dict.keys()) - len(line) == 1:
                line.append("")
            else:
                raise exceptions.SampleSheetError(
                    ("Your sample sheet is malformed. Expected to find {} "
                     "columns in the [Data] section, but only found {} columns "
                     "for line {}.".format(len(sample_dict.keys()), len(line), line)),
                    sample_sheet_file
                )

        for index, key in enumerate(sample_dict.keys()):
            sample_dict[key] = line[index].strip()  # assumes values are never empty

        new_sample_dict = deepcopy(sample_dict)
        new_sample_name = new_sample_dict['sampleName']
        del new_sample_dict['sampleName']

        sample = model.Sample(
            sample_name=new_sample_name,
            description="",
            sample_number=sample_number + 1,
            samp_dict=new_sample_dict)
        sample_list.append(sample)

    return sample_list
Example #7
def parse_metadata(sample_sheet_file):
    """
    Parse all lines under [Header], [Reads] and [Settings] in .csv file
    Lines under [Reads] are stored in a list with key name "readLengths"
    All other key names are translated according to the
        metadata_key_translation_dict

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    returns a dictionary containing the parsed key:value pairs from the .csv file
    """

    metadata_dict = {"readLengths": []}

    csv_reader = common.get_csv_reader(sample_sheet_file)

    metadata_key_translation_dict = {
        'Assay': 'assay',
        'Description': 'description',
        'Application': 'application',
        'Investigator Name': 'investigatorName',
        'Adapter': 'adapter',
        'AdapterRead2': 'adapterread2',
        'Workflow': 'workflow',
        'ReverseComplement': 'reversecomplement',
        'IEMFileVersion': 'iemfileversion',
        'Date': 'date',
        'Experiment Name': 'experimentName',
        'Chemistry': 'chemistry',
        'Project Name': 'projectName'
    }

    section = None

    for line in csv_reader:
        if "[Header]" in line or "[Settings]" in line:
            section = "header"
            continue
        elif "[Reads]" in line:
            section = "reads"
            continue
        elif "[Data]" in line:
            break
        elif line and line[0].startswith("["):
            section = "unknown"
            continue

        if not line or not line[0]:
            continue

        if not section:
            logging.debug(
                "Sample sheet is missing important sections: no sections were found"
            )
            raise exceptions.SampleSheetError(
                "Sample sheet is missing important sections: no sections were found.",
                sample_sheet_file)
        elif section == "header":
            try:
                key_name = metadata_key_translation_dict[line[0]]
                metadata_dict[key_name] = line[1]
            except KeyError:
                logging.debug("Unexpected key in header: [{}]".format(line[0]))
        elif section == "reads":
            metadata_dict["readLengths"].append(line[0])

    # currently sends just the larger readLengths
    if len(metadata_dict["readLengths"]) > 0:
        if len(metadata_dict["readLengths"]) == 2:
            metadata_dict["layoutType"] = "PAIRED_END"
        else:
            metadata_dict["layoutType"] = "SINGLE_END"
        metadata_dict["readLengths"] = max(metadata_dict["readLengths"])
    else:
        # this is an exceptional case, you can't have no read lengths!
        logging.debug(
            "The sample sheet is missing important sections: no [Reads] section found."
        )
        raise exceptions.SampleSheetError(
            "The sample sheet is missing important sections: no [Reads] section found.",
            sample_sheet_file)

    return metadata_dict
Example #8
def _parse_samples(sample_sheet_file):
    """
    Parse all the lines under "[Data]" in .csv file

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    returns a list containing Sample objects created from the parsed
        key:value pairs in the .csv file
    """

    logging.info("Reading data from sample sheet {}".format(sample_sheet_file))

    csv_reader = common.get_csv_reader(sample_sheet_file)
    # start with an ordered dictionary so that keys are ordered in the same
    # way that they are inserted.
    sample_dict = OrderedDict()
    sample_list = []

    sample_key_list = [
        'Sample_Name', 'Project_ID', 'File_Forward', 'File_Reverse'
    ]

    # initialize dictionary keys from first line (data headers/attributes)
    set_attributes = False
    for line in csv_reader:

        if set_attributes:
            for item in line:

                if item in sample_key_list:
                    key_name = item
                    sample_dict[key_name] = ""

            break

        if "[Data]" in line:
            set_attributes = True

    # fill in values for keys. line is currently below the [Data] headers
    for sample_number, line in enumerate(csv_reader):
        # if the line is empty (like a blank line at the end of the file) continue
        if not line:
            continue

        if len(sample_dict.keys()) != len(line):
            """
            if there is one more Data header compared to the length of
            data values then add an empty string to the end of data values
            i.e. the File_Reverse will be an empty string
            assumes the last Data header is going to be the File_Reverse
            this handles the case where the last trailing comma is trimmed when
            doing a single end run
            """
            if len(sample_dict.keys()) - len(line) == 1:
                line.append("")
            else:
                raise exceptions.SampleSheetError((
                    "Your sample sheet is malformed. Expected to find {} "
                    "columns in the [Data] section, but only found {} columns "
                    "for line {}.".format(len(sample_dict.keys()), len(line),
                                          line)), sample_sheet_file)

        for index, key in enumerate(sample_dict.keys()):
            value = line[index].strip()

            # Keys other than 'File_Reverse' cannot be empty
            if len(value) == 0:  # no value ("is 0" tested identity, not equality)
                if key != 'File_Reverse':
                    raise exceptions.SampleSheetError((
                        "Your sample sheet is malformed. {} in the [Data] section cannot be empty."
                        "".format(key)), sample_sheet_file)

            sample_dict[key] = value

        new_sample_dict = deepcopy(sample_dict)
        new_sample_name = new_sample_dict['Sample_Name']
        new_sample_project = new_sample_dict['Project_ID']
        new_sample_dict['sample_project'] = new_sample_project
        del new_sample_dict['Sample_Name']
        del new_sample_dict['Project_ID']

        sample = model.Sample(sample_name=new_sample_name,
                              description="",
                              sample_number=sample_number + 1,
                              samp_dict=new_sample_dict)

        sample_list.append(sample)

    return sample_list
Example #9
def validate_sample_sheet(sample_sheet_file):
    """
    Checks if the given sample_sheet_file can be parsed
    Requires [Data] for creating Sample objects, with the Sample_Name,
        Project_ID, File_Forward and File_Reverse table headers

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    returns ValidationResult object - stores list of string error messages
    """

    csv_reader = common.get_csv_reader(sample_sheet_file)

    v_res = model.ValidationResult()

    all_data_headers_found = False
    data_sect_found = False
    check_data_headers = False

    # status of required data headers
    found_data_headers = {
        "Sample_Name": False,
        "Project_ID": False,
        "File_Forward": False,
        "File_Reverse": False
    }

    for line in csv_reader:

        if "[Data]" in line:
            data_sect_found = True
            check_data_headers = True  # next line contains data headers

        elif check_data_headers:
            for data_header in found_data_headers.keys():
                if data_header in line:
                    found_data_headers[data_header] = True

            # if all required data headers are found
            if all(found_data_headers.values()):
                all_data_headers_found = True

            check_data_headers = False

    if not all([data_sect_found, all_data_headers_found]):

        if data_sect_found is False:
            v_res.add_error(
                exceptions.SampleSheetError(
                    "[Data] section not found in SampleSheet",
                    sample_sheet_file))

        if all_data_headers_found is False:
            missing_headers = [header for header, found in found_data_headers.items()
                               if not found]
            missing_str = ", ".join(missing_headers)
            v_res.add_error(
                exceptions.SampleSheetError(
                    "Missing required data header(s): " + missing_str,
                    sample_sheet_file))

    return v_res