from os.path import basename
from string import letters, digits, upper

# duplicates_indices(items) is expected to return a dict of
# {duplicated item: [indices of occurrences]}; it is assumed to be
# provided by the project's shared utility module.


def check_mapping_data(mapping_data, headers, filename_column):
    """ Checks mapping data for MIMARKS SampleIDs, unique IDs, fasta file names

    Also returns a dict of fasta file name: SampleID

    mapping_data:  list of lines of data from mapping file
    headers:  list of header strings
    filename_column:  Column of metadata mapping file containing fasta filenames
    """

    # First make sure there is a SampleID and filename_column present
    try:
        sample_id_ix = headers.index("SampleID")
    except ValueError:
        raise ValueError("SampleID column not found in mapping file, please " +
                         "check mapping file with validate_mapping_file.py")
    try:
        filename_col_ix = headers.index(filename_column)
    except ValueError:
        raise ValueError("Specified column %s not found in mapping file." %
                         filename_column)

    valid_mimarks = letters + digits + "."

    fasta_name_to_sample_id = {}

    fasta_names = []
    sample_ids = []
    for line in mapping_data:
        try:
            fasta_name_to_sample_id[basename(line[filename_col_ix].strip())] =\
                line[sample_id_ix]
        except IndexError:
            raise IndexError("Missing filename column data in line %s " % line)

        for curr_char in line[sample_id_ix]:
            if curr_char not in valid_mimarks:
                raise ValueError(
                    "Found invalid character in line: %s\n" % line +
                    "SampleIDs must be alphanumeric and . characters " +
                    "only")
        sample_ids.append(line[sample_id_ix].strip())
        fasta_names.append(line[filename_col_ix].strip())

    fasta_name_dups = duplicates_indices(fasta_names)
    if fasta_name_dups:
        raise ValueError(
            "Found duplicate fasta names: %s" %
            "\t".join([fasta_name for fasta_name in fasta_name_dups.keys()]))

    sample_id_dups = duplicates_indices(sample_ids)
    if sample_id_dups:
        raise ValueError(
            "Found duplicate SampleID names: %s" %
            "\t".join([sample_id for sample_id in sample_id_dups.keys()]))

    return fasta_name_to_sample_id
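# Illustrative sketch (not part of the original module): a minimal example of
# how check_mapping_data might be called. The header names, file paths, and
# SampleIDs below are hypothetical, assuming a mapping file with an
# "InputFastaFile" column holding per-sample fasta paths.
def _example_check_mapping_data():
    headers = ["SampleID", "InputFastaFile", "Description"]
    mapping_data = [
        ["Sample.1", "/home/user/seqs1.fna", "first sample"],
        ["Sample.2", "/home/user/seqs2.fna", "second sample"],
    ]
    # Expected result: {"seqs1.fna": "Sample.1", "seqs2.fna": "Sample.2"}
    return check_mapping_data(mapping_data, headers, "InputFastaFile")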
def check_fixed_len_bcs_dups(header, mapping_data, errors):
    """ Checks barcodes of same length for duplicates, adds to errors if found

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors
    """

    header_field_to_check = "BarcodeSequence"

    # Skip if no field BarcodeSequence
    try:
        check_ix = header.index(header_field_to_check)
    except ValueError:
        return errors

    barcodes = []
    correction = 1

    for curr_data in mapping_data:
        barcodes.append(upper(curr_data[check_ix]))

    dups = duplicates_indices(barcodes)

    for curr_dup in dups:
        for curr_loc in dups[curr_dup]:
            errors.append('Duplicate barcode %s found.\t%d,%d' %
                          (curr_dup, curr_loc + correction, check_ix))

    return errors
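# Illustrative sketch (assumption, not in the original source): example input
# showing how a duplicated fixed-length barcode is reported. The barcode and
# primer sequences are made-up values.
def _example_check_fixed_len_bcs_dups():
    header = ["SampleID", "BarcodeSequence", "LinkerPrimerSequence", "Description"]
    mapping_data = [
        ["S1", "AACCTT", "CTGCTGCCTCCCGTAGGAGT", "sample 1"],
        ["S2", "AACCTT", "CTGCTGCCTCCCGTAGGAGT", "sample 2"],
        ["S3", "AACCGG", "CTGCTGCCTCCCGTAGGAGT", "sample 3"],
    ]
    # Expected errors: two entries flagging barcode AACCTT at data rows 1 and 2
    # in column 1, e.g. 'Duplicate barcode AACCTT found.\t1,1'
    return check_fixed_len_bcs_dups(header, mapping_data, [])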
def check_variable_len_bcs_dups(header, mapping_data, errors):
    """ Checks variable length barcodes plus sections of primers for dups

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors
    """

    header_field_to_check = "BarcodeSequence"

    # Skip if no field BarcodeSequence
    try:
        check_ix = header.index(header_field_to_check)
    except ValueError:
        return errors

    linker_primer_field = "LinkerPrimerSequence"

    try:
        linker_primer_ix = header.index(linker_primer_field)
        no_primers = False
    except ValueError:
        no_primers = True

    barcodes = []
    bc_lens = []
    correction = 1

    for curr_data in mapping_data:
        barcodes.append(upper(curr_data[check_ix]))
        bc_lens.append(len(curr_data[check_ix]))

    # Get max length of barcodes to determine how many primer bases to slice
    barcode_max_len = max(bc_lens)

    # Have to do second pass to append correct number of nucleotides to
    # check for duplicates between barcodes and primer sequences
    bcs_added_nts = []
    for curr_data in mapping_data:
        if no_primers:
            bcs_added_nts.append(upper(curr_data[check_ix]))
        else:
            adjusted_len = barcode_max_len - len(curr_data[check_ix])
            bcs_added_nts.append(upper(curr_data[check_ix] +
                                 curr_data[linker_primer_ix][0:adjusted_len]))

    dups = duplicates_indices(bcs_added_nts)

    for curr_dup in dups:
        for curr_loc in dups[curr_dup]:
            if no_primers:
                errors.append('Duplicate barcode %s found.\t%d,%d' %
                              (curr_dup, curr_loc + correction, check_ix))
            else:
                errors.append('Duplicate barcode and primer fragment sequence ' +
                              '%s found.\t%d,%d' %
                              (curr_dup, curr_loc + correction, check_ix))

    return errors
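# Illustrative sketch (assumption, not in the original source): with variable
# length barcodes, shorter barcodes are padded with leading primer bases so
# that overlapping barcode/primer combinations are caught. The barcode and
# primer sequences are made-up values.
def _example_check_variable_len_bcs_dups():
    header = ["SampleID", "BarcodeSequence", "LinkerPrimerSequence", "Description"]
    mapping_data = [
        ["S1", "AACC", "TCCGGACTACGGGAGGCAGCAG", "sample 1"],
        ["S2", "AACCT", "CCGGACTACGGGAGGCAGCAG", "sample 2"],
    ]
    # "AACC" plus its first primer base "T" collides with "AACCT", so both rows
    # are reported, e.g.
    # 'Duplicate barcode and primer fragment sequence AACCT found.\t1,1'
    return check_variable_len_bcs_dups(header, mapping_data, [])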
def check_sampleid_duplicates(header, mapping_data, errors):
    """ Flags duplicate, missing SampleIDs as errors

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors
    """

    sample_id_field = "SampleID"
    correction = 1

    try:
        sample_id_ix = header.index(sample_id_field)
    except ValueError:
        # Skip out at this point, header check will have error for missing
        # field
        return errors

    sample_ids = []
    # Need to save locations of missing IDs so they aren't flagged twice
    missing_sample_ids = []

    for curr_data in range(len(mapping_data)):
        if len(mapping_data[curr_data][sample_id_ix]) == 0:
            errors.append('Missing SampleID.\t%d,%d' %
                          (curr_data + correction, sample_id_ix))
            missing_sample_ids.append(curr_data + correction)
        sample_ids.append(mapping_data[curr_data][sample_id_ix])

    dups = duplicates_indices(sample_ids)

    for curr_dup in dups:
        for curr_loc in dups[curr_dup]:
            if (curr_loc + correction) not in missing_sample_ids:
                errors.append('Duplicate SampleID %s found.\t%d,%d' %
                              (curr_dup, curr_loc + correction, sample_id_ix))

    return errors
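# Illustrative sketch (assumption, not in the original source): a missing
# SampleID is reported once, and duplicated SampleIDs are reported at every
# data row where they occur. All values are made up.
def _example_check_sampleid_duplicates():
    header = ["SampleID", "BarcodeSequence", "Description"]
    mapping_data = [
        ["S1", "AACC", "sample 1"],
        ["S1", "GGTT", "sample 2"],
        ["", "TTGG", "sample 3"],
    ]
    # Expected errors: 'Missing SampleID.\t3,0' plus
    # 'Duplicate SampleID S1 found.\t1,0' and 'Duplicate SampleID S1 found.\t2,0'
    return check_sampleid_duplicates(header, mapping_data, [])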
def check_added_demultiplex_dups(header,
                                 mapping_data,
                                 errors,
                                 has_barcodes=True,
                                 added_demultiplex_field=None):
    """ Checks that all barcodes and added demultiplex fields are unique

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors
    has_barcodes:  True if barcode fields are to be used.
    added_demultiplex_field:  If specified, references a field in the mapping
     file to use for demultiplexing.  These are to be read from fasta labels
     during the actual demultiplexing step.  All combinations of barcodes,
     primers, and the added_demultiplex_field must be unique.
    """

    # Treat as variable length to test combinations of barcodes and the
    # added demultiplex field (should return the same result for the barcode
    # component)

    correction = 1

    header_field_to_check = "BarcodeSequence"
    bc_found = False

    # Skip if no field BarcodeSequence
    if has_barcodes:
        try:
            bc_ix = header.index(header_field_to_check)
            bc_found = True
        except ValueError:
            pass

    linker_primer_field = "LinkerPrimerSequence"

    try:
        linker_primer_ix = header.index(linker_primer_field)
        no_primers = False
    except ValueError:
        no_primers = True

    try:
        added_demultiplex_ix = header.index(added_demultiplex_field)
    except ValueError:
        # Skip out at this point, header check will have error for missing
        # field
        return errors

    barcodes = []
    bc_lens = []
    bcs_added_field = []

    if has_barcodes and bc_found:
        for curr_data in mapping_data:
            barcodes.append(upper(curr_data[bc_ix]))
            bc_lens.append(len(curr_data[bc_ix]))

        # Get max length of barcodes to determine how many primer bases to
        # slice
        barcode_max_len = max(bc_lens)

        # Have to do second pass to append correct number of nucleotides to
        # check for duplicates between barcodes and primer sequences
        for curr_data in mapping_data:
            if no_primers:
                bcs_added_field.append(curr_data[bc_ix] +
                                       curr_data[added_demultiplex_ix])
            else:
                adjusted_len = barcode_max_len - len(curr_data[bc_ix])
                bcs_added_field.append(curr_data[bc_ix] +
                                       curr_data[linker_primer_ix][0:adjusted_len] +
                                       curr_data[added_demultiplex_ix])
    else:
        for curr_data in mapping_data:
            bcs_added_field.append(curr_data[added_demultiplex_ix])

    dups = duplicates_indices(bcs_added_field)

    for curr_dup in dups:
        if has_barcodes and bc_found:
            for curr_loc in dups[curr_dup]:
                errors.append('Duplicate barcode and added demultiplex field ' +
                              '%s found.\t%d,%d' %
                              (curr_dup, curr_loc + correction, bc_ix))
        else:
            for curr_loc in dups[curr_dup]:
                errors.append('Duplicate added demultiplex field ' +
                              '%s found.\t%d,%d' %
                              (curr_dup, curr_loc + correction,
                               added_demultiplex_ix))

    return errors
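# Illustrative sketch (assumption, not in the original source): combining a
# shared barcode with an added demultiplex field; only (barcode, field)
# combinations that repeat are reported. The "run_prefix" field name and all
# sequence values below are hypothetical example data.
def _example_check_added_demultiplex_dups():
    header = ["SampleID", "BarcodeSequence", "LinkerPrimerSequence",
              "run_prefix", "Description"]
    mapping_data = [
        ["S1", "AACC", "CTGCTGCCTCC", "run1", "sample 1"],
        ["S2", "AACC", "CTGCTGCCTCC", "run2", "sample 2"],
        ["S3", "AACC", "CTGCTGCCTCC", "run1", "sample 3"],
    ]
    # Rows 1 and 3 share barcode AACC and run_prefix run1, so both are flagged,
    # e.g. 'Duplicate barcode and added demultiplex field AACCrun1 found.\t1,1'
    return check_added_demultiplex_dups(header, mapping_data, [],
                                        has_barcodes=True,
                                        added_demultiplex_field="run_prefix")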