Exemplo n.º 1
0
    def test_load_config_id_name_attr(self):
        """A Name Attribute of 'ID' should produce an empty preference list."""
        feat_row = self.csv_feat_row_dict.copy()
        feat_row['Name'] = 'ID'
        sheet = self.csv("features.csv", [feat_row.values()])

        dummy_file = '/dev/null'
        with patch('tiny.rna.configuration.open', mock_open(read_data=sheet)):
            _, gff_files = counter.load_config(dummy_file, False)

        # The 'ID' preference is filtered out, leaving {file: []}
        source_path = from_here(dummy_file, feat_row['Source'])
        self.assertEqual(gff_files, defaultdict(list, {source_path: []}))
Exemplo n.º 2
0
    def test_load_samples_single_cmd(self):
        """A single samples.csv row should yield one {File, Name} entry."""
        dummy_file = '/dev/null'
        fastq_name = "test.fastq"
        sam_path = from_here(dummy_file, "test_aligned_seqs.sam")

        sample_row = {'File': fastq_name, 'Group': "test_group", 'Rep': "0"}
        sheet = self.csv("samples.csv", [sample_row.values()])

        with patch('tiny.rna.configuration.open', mock_open(read_data=sheet)):
            inputs_step = counter.load_samples(dummy_file, is_pipeline=False)

        # The library name is composed from the Group and Rep columns
        lib_name = f"{sample_row['Group']}_rep_{sample_row['Rep']}"
        self.assertEqual(inputs_step, [{'File': sam_path, 'Name': lib_name}])
Exemplo n.º 3
0
    def test_load_config_duplicate_rules(self):
        """Duplicate rows should collapse to a single rule and Name entry."""
        feat_row = self.csv_feat_row_dict.values()
        sheet = self.csv("features.csv", [feat_row, feat_row])  # Duplicate rows

        dummy_filename = '/dev/null'
        with patch('tiny.rna.configuration.open', mock_open(read_data=sheet)):
            ruleset, gff_files = counter.load_config(dummy_filename, False)

        row_dict = self.csv_feat_row_dict
        gff_path = from_here(dummy_filename, row_dict['Source'])
        expected_gff = defaultdict(list, {gff_path: [row_dict['Name']]})

        self.assertEqual(gff_files, expected_gff)
        self.assertEqual(ruleset, self.feat_rule)
Exemplo n.º 4
0
    def test_load_config_single_cmd(self):
        """A one-row Features Sheet should yield one rule and one Name entry."""
        feat_row = self.csv_feat_row_dict.values()
        sheet = self.csv("features.csv", [feat_row])

        dummy_file = '/dev/null'
        with patch('tiny.rna.configuration.open', mock_open(read_data=sheet)):
            ruleset, gff_files = counter.load_config(dummy_file,
                                                     is_pipeline=False)

        row_dict = self.csv_feat_row_dict
        gff_path = from_here(dummy_file, row_dict['Source'])
        expected_gff = defaultdict(list, {gff_path: [row_dict['Name']]})

        self.assertEqual(gff_files, expected_gff)
        self.assertEqual(ruleset, self.feat_rule)
Exemplo n.º 5
0
def load_config(features_csv: str,
                is_pipeline: bool) -> Tuple[List[dict], Dict[str, list]]:
    """Parses the Features Sheet into selection rules and GFF file preferences.

    Args:
        features_csv: a csv file which defines feature sources and selection rules
        is_pipeline: helps locate GFF files defined in the Features Sheet. If true,
            GFF files are assumed to reside in the working directory.

    Returns:
        rules: a list of dictionaries, each representing a parsed row from input.
            These are raw rule definitions; FeatureSelector digests them further
            to produce its rules table.
        gff_files: a dict of GFF files and associated Name Attribute preferences
    """

    rules: List[dict] = []
    gff_files: Dict[str, list] = defaultdict(list)

    for row in CSVReader(features_csv, "Features Sheet").rows():
        rule = {col: row[col]
                for col in ("Strand", "Hierarchy", "nt5end", "Length", "Strict")}
        # Convert RNA base to cDNA base
        rule['nt5end'] = rule['nt5end'].upper().translate({ord('U'): 'T'})
        # Identity tuple pairs the attribute key with its expected value
        rule['Identity'] = (row['Key'], row['Value'])
        # Hierarchy is compared numerically downstream
        rule['Hierarchy'] = int(rule['Hierarchy'])
        # Normalized here; built later in ReferenceTables
        rule['Strict'] = rule['Strict'].lower()

        if is_pipeline:
            gff = os.path.basename(row['Source'])
        else:
            gff = from_here(features_csv, row['Source'])

        # Duplicate Name Attributes and rule entries are not allowed;
        # 'ID' is reserved and never recorded as a preference
        name_attr = row['Name']
        if name_attr != "ID" and name_attr not in gff_files[gff]:
            gff_files[gff].append(name_attr)
        if rule not in rules:
            rules.append(rule)

    return rules, gff_files
Exemplo n.º 6
0
    def get_library_filename(csv_row_file: str, samples_csv: str) -> str:
        """Resolve a Samples Sheet entry; rows may list fastq or sam files."""

        ext = os.path.splitext(csv_row_file)[1].lower()

        if ext in (".fastq", ".gz"):
            # fastq(.gz) inputs: infer the name of the pipeline-produced .sam.
            # Relative paths are resolved against samples_csv, not the cwd
            if is_pipeline:
                resolved = os.path.basename(csv_row_file)
            else:
                resolved = from_here(samples_csv, csv_row_file)
            return os.path.splitext(resolved)[0] + "_aligned_seqs.sam"

        if ext == ".sam":
            if os.path.isabs(csv_row_file):
                return csv_row_file
            raise ValueError(
                "The following file must be expressed as an absolute path:\n%s"
                % (csv_row_file, ))

        raise ValueError(
            "The filenames defined in your Samples Sheet must have a .fastq(.gz) or .sam extension.\n"
            "The following filename contained neither:\n%s" %
            (csv_row_file, ))