def test_load_config_id_name_attr(self):
    """A rule whose Name Attribute is "ID" should register its GFF file with an empty preference list."""

    id_row = self.csv_feat_row_dict.copy()
    id_row['Name'] = 'ID'
    mock_csv = self.csv("features.csv", [id_row.values()])

    dummy_file = '/dev/null'
    with patch('tiny.rna.configuration.open', mock_open(read_data=mock_csv)):
        _, gff_files = counter.load_config(dummy_file, False)

    # Expect {file: [empty Name Attribute list]}
    expected_source = from_here(dummy_file, id_row['Source'])
    expected = defaultdict(list, {expected_source: []})
    self.assertEqual(gff_files, expected)
def test_load_samples_single_cmd(self):
    """A single fastq sample row should yield one {'File', 'Name'} entry with an inferred .sam path."""

    dummy_file = '/dev/null'
    fastq_name = "test.fastq"
    aligned_name = from_here(dummy_file, "test_aligned_seqs.sam")
    sample_row = {'File': fastq_name, 'Group': "test_group", 'Rep': "0"}
    mock_csv = self.csv("samples.csv", [sample_row.values()])

    with patch('tiny.rna.configuration.open', mock_open(read_data=mock_csv)):
        inputs_step = counter.load_samples(dummy_file, is_pipeline=False)

    # Library name is derived from the Group and Rep columns
    lib_name = f"{sample_row['Group']}_rep_{sample_row['Rep']}"
    self.assertEqual(inputs_step, [{'File': aligned_name, 'Name': lib_name}])
def test_load_config_duplicate_rules(self):
    """Duplicate rows in the Features Sheet must be collapsed to a single rule and Name entry."""

    # Features CSV with two duplicate rules/rows
    row_values = self.csv_feat_row_dict.values()
    mock_csv = self.csv("features.csv", [row_values, row_values])

    dummy_filename = '/dev/null'
    with patch('tiny.rna.configuration.open', mock_open(read_data=mock_csv)):
        ruleset, gff_files = counter.load_config(dummy_filename, False)

    row = self.csv_feat_row_dict
    expected_source = from_here(dummy_filename, row['Source'])
    expected_gff = defaultdict(list, {expected_source: [row['Name']]})

    self.assertEqual(gff_files, expected_gff)
    self.assertEqual(ruleset, self.feat_rule)
def test_load_config_single_cmd(self):
    """A single Features Sheet row should produce one rule and one GFF Name entry."""

    # Features CSV with a single rule/row
    row_values = self.csv_feat_row_dict.values()
    mock_csv = self.csv("features.csv", [row_values])

    dummy_file = '/dev/null'
    with patch('tiny.rna.configuration.open', mock_open(read_data=mock_csv)):
        ruleset, gff_files = counter.load_config(dummy_file, is_pipeline=False)

    row = self.csv_feat_row_dict
    expected_source = from_here(dummy_file, row['Source'])
    expected_gff = defaultdict(list, {expected_source: [row['Name']]})

    self.assertEqual(gff_files, expected_gff)
    self.assertEqual(ruleset, self.feat_rule)
def load_config(features_csv: str, is_pipeline: bool) -> Tuple[List[dict], Dict[str, list]]:
    """Parses the Features Sheet to provide inputs to FeatureSelector and build_reference_tables

    Args:
        features_csv: a csv file which defines feature sources and selection rules
        is_pipeline: helps locate GFF files defined in the Features Sheet. If true,
            GFF files are assumed to reside in the working directory.

    Returns:
        rules: a list of dictionaries, each representing a parsed row from input.
            Note that these are just rule definitions which FeatureSelector will
            further digest to produce its rules table.
        gff_files: a dict of GFF files and associated Name Attribute preferences
    """

    parsed_rules: List[dict] = []
    gff_files: Dict[str, list] = defaultdict(list)

    for row in CSVReader(features_csv, "Features Sheet").rows():
        rule = {field: row[field] for field in ("Strand", "Hierarchy", "nt5end", "Length", "Strict")}
        rule['nt5end'] = rule['nt5end'].upper().translate({ord('U'): 'T'})  # Convert RNA base to cDNA base
        rule['Identity'] = (row['Key'], row['Value'])                       # Create identity tuple
        rule['Hierarchy'] = int(rule['Hierarchy'])                          # Convert hierarchy to number
        rule['Strict'] = rule['Strict'].lower()                             # Built later in ReferenceTables

        if is_pipeline:
            gff = os.path.basename(row['Source'])
        else:
            gff = from_here(features_csv, row['Source'])

        # Duplicate Name Attributes and rule entries are not allowed. The defaultdict
        # access below intentionally registers the GFF file even when no name is added
        # (e.g. when the Name Attribute is "ID").
        known_names = gff_files[gff]
        if row['Name'] != "ID" and row['Name'] not in known_names:
            known_names.append(row['Name'])
        if rule not in parsed_rules:
            parsed_rules.append(rule)

    return parsed_rules, gff_files
def get_library_filename(csv_row_file: str, samples_csv: str) -> str:
    """The input samples.csv may contain either fastq or sam files"""

    extension = os.path.splitext(csv_row_file)[1].lower()

    if extension in (".fastq", ".gz"):
        # If the sample file has a fastq(.gz) extension, infer the name of its
        # pipeline-produced .sam file.
        # NOTE(review): `is_pipeline` is a free variable here — presumably bound in an
        # enclosing scope (e.g. a closure inside load_samples). Confirm this function
        # is defined where that name resolves.
        if is_pipeline:
            csv_row_file = os.path.basename(csv_row_file)
        else:
            # Fix relative paths to be relative to sample_csv's path, rather than relative to cwd
            csv_row_file = from_here(samples_csv, csv_row_file)
        csv_row_file = os.path.splitext(csv_row_file)[0] + "_aligned_seqs.sam"
    elif extension == ".sam":
        if not os.path.isabs(csv_row_file):
            raise ValueError("The following file must be expressed as an absolute path:\n%s" % (csv_row_file,))
    else:
        raise ValueError(
            "The filenames defined in your Samples Sheet must have a .fastq(.gz) or .sam extension.\n"
            "The following filename contained neither:\n%s" % (csv_row_file,))

    return csv_row_file