def process_mapping_file(map_f,
                         barcode_len,
                         barcode_type,
                         BARCODE_COLUMN,
                         REVERSE_PRIMER_COLUMN):
    """Build the barcode lookup tables for a metadata mapping file.

    The mapping file is first run through check_map, then rewound and
    re-parsed so the reverse primers of every sample can be collected.
    Finally the barcodes are handed to check_barcodes together with the
    expected length and type.

    Parameters
    ----------
    map_f: file
        metadata mapping file; must support seek(0) because it is read twice
    barcode_len: int
        barcode length, forwarded to check_barcodes
    barcode_type: string
        barcode type, forwarded to check_barcodes
    BARCODE_COLUMN: string
        header of barcode column
    REVERSE_PRIMER_COLUMN: string
        header of the reverse primer column

    Returns
    -------
    bc_to_sid: dict
        as returned by check_map
    bc_to_fwd_primers: dict
        as returned by check_map
    bc_to_rev_primers: dict
        barcode -> result of expand_degeneracies on the sample's
        upper-cased, comma-split reverse primer entry

    Raises
    ------
    Exception
        if a sample's metadata has no REVERSE_PRIMER_COLUMN entry
    """
    # Only the 3rd and 6th values returned by check_map are needed here.
    _, _, bc_to_sid, _, _, bc_to_fwd_primers, _ = check_map(map_f, False)

    # check_map consumed the file; rewind before parsing it a second time.
    map_f.seek(0)
    metadata_map = parse_mapping_file_to_dict(map_f)[0]

    bc_to_rev_primers = {}
    for sample_md in metadata_map.values():
        if REVERSE_PRIMER_COLUMN not in sample_md:
            raise Exception(
                "The %s column does not exist in the "
                "mapping file. %s is required." %
                (REVERSE_PRIMER_COLUMN,
                 REVERSE_PRIMER_COLUMN))

        # Several reverse primers may be given as a comma-separated list.
        raw_primers = sample_md[REVERSE_PRIMER_COLUMN].upper().split(',')
        expanded_primers = expand_degeneracies(raw_primers)
        bc_to_rev_primers[sample_md[BARCODE_COLUMN]] = expanded_primers

    check_barcodes(bc_to_sid, barcode_len, barcode_type)

    return (bc_to_sid,
            bc_to_fwd_primers,
            bc_to_rev_primers)
Пример #2
0
def get_mapping_details(mapping_fp):
    """ Returns SampleIDs, Barcodes, Primer seqs from mapping file

    mapping_fp: filepath to mapping file

    Returns a 3-tuple of sets: the SampleIDs (first field of every data
    row), the values of the 'BarcodeSequence' column, and the result of
    passing the distinct 'LinkerPrimerSequence' values through
    expand_degeneracies.

    Raises ValueError if process_id_map reports any errors for the file.
    """

    # The context manager closes the file even if parsing raises.
    # NOTE(review): the "U" (universal newlines) mode is kept from the
    # original; it was removed in Python 3.11 -- confirm the target version.
    with open(mapping_fp, "U") as mapping_f:
        # Only the headers, the data rows and the errors are used below.
        hds, mapping_data, run_description, errors, warnings = \
            process_id_map(mapping_f)

    # Any reported error aborts processing.
    if errors:
        raise ValueError('Error in mapping file, please validate '
                         'mapping file with check_id_map.py')

    # SampleID -> {header: value}. Rows are indexed by header position, so a
    # row shorter than the header line still raises IndexError.
    id_map = {}
    for row in mapping_data:
        id_map[row[0]] = {hd: row[col] for col, hd in enumerate(hds)}

    # Sets remove duplicates as the values are collected.
    barcode_seqs = set()
    raw_linkerprimer_seqs = set()
    for sample_md in id_map.values():
        barcode_seqs.add(sample_md['BarcodeSequence'])
        raw_linkerprimer_seqs.add(sample_md['LinkerPrimerSequence'])

    linker_primer_seqs = expand_degeneracies(raw_linkerprimer_seqs)

    return set(id_map), barcode_seqs, set(linker_primer_seqs)
Пример #3
0
def get_mapping_details(mapping_fp):
    """ Returns SampleIDs, Barcodes, Primer seqs from mapping file

    mapping_fp: filepath to mapping file

    Returns a 3-tuple of sets: the SampleIDs (first field of every data
    row), the values of the 'BarcodeSequence' column, and the result of
    passing the distinct 'LinkerPrimerSequence' values through
    expand_degeneracies.

    Raises ValueError if process_id_map reports any errors for the file.
    """

    # The context manager closes the file even if parsing raises.
    # NOTE(review): the "U" (universal newlines) mode is kept from the
    # original; it was removed in Python 3.11 -- confirm the target version.
    with open(mapping_fp, "U") as mapping_f:
        # Only the headers, the data rows and the errors are used below.
        hds, mapping_data, run_description, errors, warnings = \
            process_id_map(mapping_f)

    # Any reported error aborts processing.
    if errors:
        raise ValueError('Error in mapping file, please validate '
                         'mapping file with check_id_map.py')

    # SampleID -> {header: value}. Rows are indexed by header position, so a
    # row shorter than the header line still raises IndexError.
    id_map = {}
    for row in mapping_data:
        id_map[row[0]] = {hd: row[col] for col, hd in enumerate(hds)}

    # Sets remove duplicates as the values are collected.
    barcode_seqs = set()
    raw_linkerprimer_seqs = set()
    for sample_md in id_map.values():
        barcode_seqs.add(sample_md['BarcodeSequence'])
        raw_linkerprimer_seqs.add(sample_md['LinkerPrimerSequence'])

    linker_primer_seqs = expand_degeneracies(raw_linkerprimer_seqs)

    return set(id_map), barcode_seqs, set(linker_primer_seqs)
def get_mapping_details(mapping_fp,
                        suppress_barcode_checks=False,
                        suppress_primer_checks=False):
    """ Returns SampleIDs, Barcodes, Primer seqs from mapping file

    mapping_fp: filepath to mapping file
    suppress_barcode_checks=If True, errors whose text ends with ",1" are
     ignored and no barcodes are read from the mapping file (the returned
     barcode set is empty).
    suppress_primer_checks=If True, errors whose text ends with ",2" are
     ignored and no primers are read from the mapping file (an empty set
     is passed to expand_degeneracies).

    Returns a 3-tuple of sets: SampleIDs, barcode sequences, and the
    primers produced by expand_degeneracies.

    Raises ValueError on header errors, duplicate SampleIDs, and on
    barcode/primer errors that are not suppressed. Errors matching none of
    these categories are ignored.
    """

    # The context manager closes the file even if parsing raises.
    # NOTE(review): the "U" (universal newlines) mode is kept from the
    # original; it was removed in Python 3.11 -- confirm the target version.
    with open(mapping_fp, "U") as mapping_f:
        # Only the headers, the data rows and the errors are used below.
        hds, mapping_data, run_description, errors, warnings = \
            process_id_map(mapping_f)

    # Should raise errors for barcodes or primers unless suppressed, and
    # should raise errors for headers or duplicate SampleIDs in any case.
    loc_bcs = ",1"
    loc_primers = ",2"
    invalid_mapping_msg = ('Error in mapping file, please validate '
                           'mapping file with validate_mapping_file.py')

    for curr_error in errors or ():
        # The branch order matters: the first matching category decides
        # whether this error is fatal.
        if curr_error.startswith("Found header field"):
            is_fatal = True
        elif curr_error.endswith(loc_bcs):
            is_fatal = not suppress_barcode_checks
        elif curr_error.endswith(loc_primers):
            is_fatal = not suppress_primer_checks
        else:
            is_fatal = curr_error.startswith("Duplicate SampleID")

        if is_fatal:
            raise ValueError(invalid_mapping_msg)

    # SampleID -> {header: value}. Rows are indexed by header position, so a
    # row shorter than the header line still raises IndexError.
    id_map = {}
    for row in mapping_data:
        id_map[row[0]] = {hd: row[col] for col, hd in enumerate(hds)}

    # Sets remove duplicates as the values are collected; a suppressed
    # category leaves its set empty.
    barcode_seqs = set()
    raw_linkerprimer_seqs = set()
    for sample_md in id_map.values():
        if not suppress_barcode_checks:
            barcode_seqs.add(sample_md['BarcodeSequence'])
        if not suppress_primer_checks:
            raw_linkerprimer_seqs.add(sample_md['LinkerPrimerSequence'])

    linker_primer_seqs = expand_degeneracies(raw_linkerprimer_seqs)

    return set(id_map), barcode_seqs, set(linker_primer_seqs)
def get_mapping_details(mapping_fp,
                        suppress_barcode_checks=False,
                        suppress_primer_checks=False):
    """ Returns SampleIDs, Barcodes, Primer seqs from mapping file

    mapping_fp: filepath to mapping file
    suppress_barcode_checks=If True, errors whose text ends with ",1" are
     ignored and no barcodes are read from the mapping file (the returned
     barcode set is empty).
    suppress_primer_checks=If True, errors whose text ends with ",2" are
     ignored and no primers are read from the mapping file (an empty set
     is passed to expand_degeneracies).

    Returns a 3-tuple of sets: SampleIDs, barcode sequences, and the
    primers produced by expand_degeneracies.

    Raises ValueError on header errors, duplicate SampleIDs, and on
    barcode/primer errors that are not suppressed. Errors matching none of
    these categories are ignored.
    """

    # The context manager closes the file even if parsing raises.
    # NOTE(review): the "U" (universal newlines) mode is kept from the
    # original; it was removed in Python 3.11 -- confirm the target version.
    with open(mapping_fp, "U") as mapping_f:
        # Only the headers, the data rows and the errors are used below.
        hds, mapping_data, run_description, errors, warnings = \
            process_id_map(mapping_f)

    # Should raise errors for barcodes or primers unless suppressed, and
    # should raise errors for headers or duplicate SampleIDs in any case.
    loc_bcs = ",1"
    loc_primers = ",2"
    invalid_mapping_msg = ('Error in mapping file, please validate '
                           'mapping file with validate_mapping_file.py')

    for curr_error in errors or ():
        # The branch order matters: the first matching category decides
        # whether this error is fatal.
        if curr_error.startswith("Found header field"):
            is_fatal = True
        elif curr_error.endswith(loc_bcs):
            is_fatal = not suppress_barcode_checks
        elif curr_error.endswith(loc_primers):
            is_fatal = not suppress_primer_checks
        else:
            is_fatal = curr_error.startswith("Duplicate SampleID")

        if is_fatal:
            raise ValueError(invalid_mapping_msg)

    # SampleID -> {header: value}. Rows are indexed by header position, so a
    # row shorter than the header line still raises IndexError.
    id_map = {}
    for row in mapping_data:
        id_map[row[0]] = {hd: row[col] for col, hd in enumerate(hds)}

    # Sets remove duplicates as the values are collected; a suppressed
    # category leaves its set empty.
    barcode_seqs = set()
    raw_linkerprimer_seqs = set()
    for sample_md in id_map.values():
        if not suppress_barcode_checks:
            barcode_seqs.add(sample_md['BarcodeSequence'])
        if not suppress_primer_checks:
            raw_linkerprimer_seqs.add(sample_md['LinkerPrimerSequence'])

    linker_primer_seqs = expand_degeneracies(raw_linkerprimer_seqs)

    return set(id_map), barcode_seqs, set(linker_primer_seqs)
Пример #6
0
 def test_expand_degeneracies(self):
     """expand_degeneracies should return every sequence a primer can match"""
     # A primer without degenerate codes is returned as its only expansion.
     plain_expected = ['ACG']
     self.assertEqual(expand_degeneracies('ACG'), plain_expected)

     # 'RGY' is expected to expand to four sequences, in this exact order.
     degenerate_expected = ['AGT', 'AGC', 'GGT', 'GGC']
     self.assertEqual(expand_degeneracies('RGY'), degenerate_expected)