Пример #1
0
def seq_to_array(seq, k=1, overlap=True):
    """Turn a DNA sequence into a Numpy array of its :math:`k`-mers.

    With the default :math:`k=1` every array element is a single character
    of the sequence.

    Args:
        seq (~skbio.sequence.DNA or str): The sequence to convert. Anything
            that is not already a ``DNA`` instance is passed to ``DNA(...)``.
        k (int, optional): The :math:`k` value to use. Defaults to 1.
        overlap (bool, optional): Whether the :math:`k`-mers should overlap
            (forwarded to ``iter_kmers``). Defaults to True.

    Returns:
        ~numpy.ndarray: An array of strings (dtype ``"<U" + str(k)``), one
        element per :math:`k`-mer.

    Examples:
        .. runblock:: pycon

            >>> from krtd import seq_to_array # ignore
            >>> seq_to_array("ATGC")
            >>> seq_to_array("ATGC", k=2)
            >>> seq_to_array("ATGC", k=2, overlap=False)

    """
    # Accept plain strings as well as ready-made DNA objects.
    dna = seq if isinstance(seq, DNA) else DNA(seq)
    kmer_strings = map(str, dna.iter_kmers(k=k, overlap=overlap))
    return np.fromiter(kmer_strings, "<U" + str(k))
Пример #2
0
def process_barcode_single_end_data(read1_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    bc1_len=6,
                                    rev_comp_bc1=False):
    """ Splits a single-end read into barcode and remainder, writes both

    read1_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath (receives the barcode record)
    output_fastq1: open output fastq reads filepath (receives the read with
     the barcode removed)
    bc1_len: length of barcode to remove from beginning of data
    rev_comp_bc1: reverse complement barcode (and reverse its quality
     scores) before writing.
    """
    read_seq = read1_data[1]
    read_qual = read1_data[2]
    header = read1_data[0]

    # The barcode is the leading bc1_len positions of the read.
    barcode_seq, barcode_qual = read_seq[:bc1_len], read_qual[:bc1_len]
    if rev_comp_bc1:
        barcode_seq = str(DNA(barcode_seq).rc())
        # keep quality scores aligned with the reversed bases
        barcode_qual = barcode_qual[::-1]

    output_bc_fastq.write(
        format_fastq_record(header, barcode_seq, barcode_qual))
    output_fastq1.write(
        format_fastq_record(header, read_seq[bc1_len:], read_qual[bc1_len:]))
Пример #3
0
def parse_illumina_line(l, barcode_length, rev_comp_barcode,
                        barcode_in_sequence=False):
    """Parses a single colon-delimited line of Illumina data into a dict

    l: the line; fields are machine name, channel, tile, x position,
     'y position#barcode...', sequence and quality string
    barcode_length: number of leading characters kept as the barcode
    rev_comp_barcode: if True, the barcode is reverse complemented
    barcode_in_sequence: if True, the barcode is cut from the start of the
     sequence (and the quality string is trimmed to match) instead of being
     read from the text after '#' in the y position field
    """
    fields = l.strip().split(':')

    y_field = fields[4]
    y_parts = y_field.split('#')
    y_position = int(y_parts[0])
    sequence, qual_string = fields[5], fields[6]

    if not barcode_in_sequence:
        barcode = y_parts[1][:barcode_length]
    else:
        barcode = sequence[:barcode_length]
        sequence = sequence[barcode_length:]
        qual_string = qual_string[barcode_length:]

    if rev_comp_barcode:
        barcode = str(DNA(barcode).rc())

    return {
        'Full description': ':'.join(fields[:5]),
        'Machine Name': fields[0],
        'Channel Number': int(fields[1]),
        'Tile Number': int(fields[2]),
        'X Position': int(fields[3]),
        'Y Position': y_position,
        'Barcode': barcode,
        'Full Y Position Field': y_field,
        'Sequence': sequence,
        'Quality Score': qual_string,
    }
Пример #4
0
def rc_fasta_lines(fasta_lines, seq_desc_mapper=append_rc):
    """Yield reverse-complemented records parsed from FASTA lines.

    fasta_lines: input handed to parse_fasta
    seq_desc_mapper: callable applied to each sequence id to produce the
     id that is yielded

    Yields (mapped id, reverse complement of the upper-cased sequence as a
    string) for each parsed record.
    """
    for record_id, record_seq in parse_fasta(fasta_lines):
        mapped_id = seq_desc_mapper(record_id)
        yield mapped_id, str(DNA(record_seq.upper()).rc())
Пример #5
0
def _construct(record, constructor=None, **kwargs):
    """Build a sequence object from a ``(sequence, metadata)`` record.

    ``constructor`` defaults to ``Sequence``. When ``RNA`` is requested the
    record is first built as ``DNA`` and then transcribed.
    """
    seq, md = record
    builder = Sequence if constructor is None else constructor
    if builder == RNA:
        dna = DNA(seq, metadata=md, **kwargs)
        return dna.transcribe()
    return builder(seq, metadata=md, **kwargs)
Пример #6
0
def identity_coverage(dna_query, protein_query, dna_target, protein_target):
    """Classify how well a query matches a target using local alignments.

    The DNA sequences are compared first (skipped when ``dna_query`` is the
    empty string); if that does not give an exact match, the protein
    sequences are compared. Identity and alignment length come from
    ``extract_sw``; coverage is the alignment length divided by the length
    of the shorter of the two sequences.

    Args:
        dna_query: DNA query sequence, or '' to skip the DNA comparison.
        protein_query: protein query sequence.
        dna_target: DNA target sequence.
        protein_target: protein target sequence.

    Returns:
        str: 'EXACT' if DNA identity and coverage are both >= 0.95,
        'SIMILAR' if protein identity and coverage are both >= 0.8,
        'MATCH' if protein identity and coverage are both >= 0.5,
        otherwise 'NO MATCH'.
    """
    if dna_query != '':
        try:
            sw_dna = skbio.alignment.local_pairwise_align_ssw(
                DNA(dna_query), DNA(dna_target))
        except Exception:
            # Fall back to the generic aligner when the SSW call fails.
            # Catch Exception rather than everything so that Ctrl-C and
            # SystemExit are not swallowed here.
            sw_dna = skbio.alignment.local_pairwise_align_nucleotide(
                DNA(dna_query), DNA(dna_target))
        dna_identity, align_length = extract_sw(sw_dna)
        dna_coverage = align_length / min(len(dna_query), len(dna_target))
        if dna_identity >= 0.95 and dna_coverage >= 0.95:
            return 'EXACT'

    try:
        sw_protein = skbio.alignment.local_pairwise_align_ssw(
            Protein(protein_query),
            Protein(protein_target),
            substitution_matrix=blosum62,
            gap_open_penalty=11,
            gap_extend_penalty=1)
    except Exception:
        # Same fallback as for DNA, with identical scoring parameters.
        sw_protein = skbio.alignment.local_pairwise_align_protein(
            Protein(protein_query),
            Protein(protein_target),
            substitution_matrix=blosum62,
            gap_open_penalty=11,
            gap_extend_penalty=1)
    protein_identity, align_length = extract_sw(sw_protein)
    protein_coverage = align_length / min(len(protein_query),
                                          len(protein_target))
    if protein_identity >= 0.8 and protein_coverage >= 0.8:
        return 'SIMILAR'

    if protein_identity >= 0.5 and protein_coverage >= 0.5:
        return 'MATCH'

    return 'NO MATCH'
Пример #7
0
def find_sgRNAs():
    '''
    Search every chromosome for PAMs and process the corresponding sgRNAs.

    For each chromosome in CHROMOSOMES the forward strand and then its
    reverse complement are scanned for 20 protospacer nucleotides followed
    by one arbitrary nucleotide and 'GG'. Every hit is handed to
    process_sgRNA together with the chromosome, the strand and the exon
    interval trees, and is counted per distinct protospacer.

    Position convention described by the original author (position just
    before the sgRNA, plus strand direction):
    'tcg^acgtataaatatatcgatatNGG' would result in a tuple (3, '+')
    'atttgCCNgateagctcgatctattata^tgat' would result in a tuple (8, '-')

    Side effects: drops sgRNA_collection before searching, logs progress,
    and pickles the protospacer counts to DATADIR/sgRNA_dict.pkl.
    Returns nothing.
    '''

    # NOTE: pickle.load can execute arbitrary code; this file must come from
    # a trusted source.
    with open(EXON_INTERVAL_TREES_FILE, 'rb') as f:
        exon_interval_trees = pickle.load(f)
    sgRNA_count = 0
    sgRNA_dict = {}
    sgRNA_collection.drop()
    logging.info('Old sgRNA collection deleted')
    for chromosome in CHROMOSOMES:
        logging.info('find pams in {}'.format(chromosome))
        with open(CHROMOSOME_RAW_FILE.format(chromosome)) as chr_file:
            chr_sequence = DNA(chr_file.read().upper())
        for strand in ['+', '-']:
            # for the reverse strand, reverse complement the chromosome
            if strand == '-':
                chr_sequence = chr_sequence.reverse_complement()
            # 20 Protospacer + 1 PAM-nucleotide; the lookahead makes
            # overlapping sequences match as well
            for guide_position in chr_sequence. \
                    find_with_regex('(?=([ACTG]{20})[ACTG]GG)'):

                process_sgRNA(guide_position, chr_sequence, chromosome,
                              strand, exon_interval_trees)
                protospacer_key = kmer_to_int(chr_sequence[guide_position])
                sgRNA_dict[protospacer_key] = \
                    sgRNA_dict.get(protospacer_key, 0) + 1
                sgRNA_count += 1

    logging.info('Found {} sgRNA sites'.format(sgRNA_count))
    logging.info('Found {} distinct protospacers'.format(len(sgRNA_dict)))
    # pickle.dump needs a writable binary file; the default mode is
    # read-only text.
    with open(os.path.join(DATADIR, 'sgRNA_dict.pkl'), 'wb') as f:
        pickle.dump(sgRNA_dict, f)
Пример #8
0
def global_align(seq1_1hot, seq2_1hot):
    """Globally align two 1-hot encoded sequences.

    Both inputs are decoded with ``dna_io.hot1_dna`` and aligned with
    ``global_pairwise_align_nucleotide`` using the scoring parameters below.

    Args:
        seq1_1hot: first 1-hot encoded sequence.
        seq2_1hot: second 1-hot encoded sequence.

    Returns:
        tuple: ``(seq1_align, seq2_align)``, the two aligned sequences as
        strings.
    """
    align_opts = {
        'gap_open_penalty': 10,
        'gap_extend_penalty': 1,
        'match_score': 5,
        'mismatch_score': -4
    }

    seq1_dna = DNA(dna_io.hot1_dna(seq1_1hot))
    seq2_dna = DNA(dna_io.hot1_dna(seq2_1hot))

    # The first element of the aligner's result is the alignment itself.
    seq_align = global_pairwise_align_nucleotide(seq1_dna, seq2_dna,
                                                 **align_opts)[0]
    return str(seq_align[0]), str(seq_align[1])
Пример #9
0
def _construct(record, constructor=None, **kwargs):
    """Build a sequence object from a ``(seq, metadata, positional
    metadata)`` record.

    Without an explicit ``constructor`` the type is chosen from
    ``md['ID']['unit']`` (case-insensitive): 'bp' gives ``DNA`` and 'aa'
    gives ``Protein``. Any other unit leaves the constructor as ``None``,
    so the final call fails. ``RNA`` is produced by building ``DNA`` and
    transcribing it.
    """
    seq, md, pmd = record
    if constructor is None:
        unit = md['ID']['unit'].lower()
        constructor = {'bp': DNA, 'aa': Protein}.get(unit)
    if constructor == RNA:
        dna = DNA(seq, metadata=md, positional_metadata=pmd, **kwargs)
        return dna.transcribe()
    return constructor(seq, metadata=md, positional_metadata=pmd, **kwargs)
Пример #10
0
def get_primer_positions(primer_seqs, reference_seq):
    """Locate each primer on the reference via local alignment.

    Args:
        primer_seqs (dict): primer name -> primer sequence. Primers whose
            name contains 'RIGHT' are reverse complemented before aligning.
        reference_seq: the reference sequence the primers are aligned to.

    Returns:
        dict: maps both the start and the end position of every primer on
        the reference (as reported by ``local_pairwise_align_ssw``; the end
        is inclusive per the original author's note) to the primer's name.
        A later primer overwrites an earlier one on the same position.
    """
    # hash map to hold start, stop positions for primers
    positions = {}

    # The reference is the same for every primer, so build it once.
    reference = DNA(reference_seq)

    for qname, qseq in primer_seqs.items():
        query = DNA(qseq)
        if 'RIGHT' in qname:  # mind the reverse complement
            query = query.reverse_complement()

        # align primer to reference using (striped) Smith-Waterman; only
        # the start/end positions are needed here
        _msa, _score, start_end_positions = local_pairwise_align_ssw(
            query, reference)

        # second entry holds the (start, end) pair on the reference
        _, (pstart, pend) = start_end_positions

        # contains start, end position of primer on ref
        positions[pstart] = qname
        positions[pend] = qname
    return positions
Пример #11
0
 def test_multiple_sequence_alignment(self):
     """Write five DNA sequences to a FASTA file, align them and compare
     the result with the expected gapped sequences.
     """
     unaligned = (
         ('seq_1', 'caccggcggcccggtggtggccattattattgggtctaaag'),
         ('seq_2', 'caccggcggcccgagtggtggccattattattgggtcaagg'),
         ('seq_3', 'caccggcggcccgagtgatggccattattattgggtctaaag'),
         ('seq_4', 'aaccggcggcccaagtggtggccattattattgggtctaaag'),
         ('seq_5', 'caccgggcccgagtggtggccattattattgggtctaaag'),
     )
     aligned = (
         ('seq_1', 'caccggcggcccg-gtggtggccattattattgggtctaaag'),
         ('seq_2', 'caccggcggcccgagtggtggccattattattgggtcaagg-'),
         ('seq_3', 'caccggcggcccgagtgatggccattattattgggtctaaag'),
         ('seq_4', 'aaccggcggcccaagtggtggccattattattgggtctaaag'),
         ('seq_5', 'caccg--ggcccgagtggtggccattattattgggtctaaag'),
     )

     seqs_col = SequenceCollection(
         [DNA(bases, id=seq_id) for seq_id, bases in unaligned])
     seqs_fp = join(self.working_dir, "seqs.fna")
     with open(seqs_fp, 'w') as o:
         o.write(seqs_col.to_fasta())

     alignment = multiple_sequence_alignment(seqs_fp)
     align_exp = [DNA(bases, id=seq_id) for seq_id, bases in aligned]
     self.assertItemsEqual(alignment, align_exp)
Пример #12
0
def get_rev_primer_seqs(mapping_fp):
    """ Parses mapping file to get dictionary of SampleID:Rev primer

    mapping_fp:  mapping filepath

    Returns a dict keyed by SampleID (first column of each mapping row).
    Each value is a list with the reverse complement of every
    comma-separated entry of that sample's 'ReversePrimer' field.

    Raises ValueError if process_id_map reports a duplicate SampleID or an
    invalid DNA sequence, and KeyError if there is no 'ReversePrimer'
    column.
    """
    hds, mapping_data, run_description, errors, warnings = process_id_map(
        mapping_fp, has_barcodes=False, disable_primer_check=True)

    if errors:
        if any(err.startswith("Duplicate SampleID") for err in errors):
            raise ValueError('Errors were found with mapping file, '
                             'please run validate_mapping_file.py to '
                             'identify problems.')

    # dict of dicts with SampleID:{each header:mapping data}
    id_map = {row[0]: {} for row in mapping_data}
    for column, column_name in enumerate(hds):
        for row in mapping_data:
            id_map[row[0]][column_name] = row[column]

    reverse_primers = {}
    for sample_id, sample_fields in id_map.items():
        try:
            primer_pool = sample_fields['ReversePrimer'].split(',')
            reverse_primers[sample_id] = [
                str(DNA(primer).rc()) for primer in primer_pool]
        except KeyError:
            raise KeyError("Reverse primer not found in mapping file, "
                           "please include a 'ReversePrimer' column.")

    # Invalid reverse primers are reported by the mapping file check itself
    if any(err.startswith("Invalid DNA sequence detected")
           for err in errors):
        raise ValueError(
            "Problems found with reverse primers, please "
            "check mapping file with validate_mapping_file.py")

    return reverse_primers
Пример #13
0
def find_guide_context(gene, context, sense):
    '''
    Locate a guide context inside its gene on the chromosome sequence.

    :gene: symbol of the gene (genes in MOUSE_GENES are looked up in mm10,
        all others in hg38); also compared, cut to 15 characters, against
        the gene_id column
    :context: 30mer
    :sense: boolean whether in sense or antisense
    :returns: (species, chromosome, cut_position) where cut_position is
        0-base-index; cut_position is -1 when the context is not found
        inside the gene, and chromosome is 'chrNaN' when the gene itself is
        not found
    '''
    if gene in MOUSE_GENES:
        species, df = 'mm10', read_mm10()
    else:
        species, df = 'hg38', read_hg38()

    try:
        is_this_gene = (df.gene_name == gene) | (df.gene_id == gene[:15])
        gene_data = df[is_this_gene & (df.feature == 'gene')].iloc[0].copy()
    except IndexError:
        print(f'didnot find context {context} in gene {gene}. Sense: {sense}')
        return species, 'chrNaN', -1

    gene_data.start -= 1
    absolute_sense = (sense == (gene_data.strand == '+'))
    chromosome = gene_data.seqname
    # only if guide and gene strand are not the same
    if not absolute_sense:
        context = str(DNA(context).reverse_complement())
    chr_seq = chromosomes(species)[chromosome]

    index = chr_seq.find(context)
    while index != -1:
        cut_position = cut_position_from_index(index, absolute_sense, chr_seq)
        if gene_data.start <= cut_position < gene_data.end:
            # first occurrence whose cut lies inside the gene wins
            return species, chromosome, cut_position
        index = chr_seq.find(context, index + len(context))

    print(f'didnot find context {context} in gene {gene}. Sense: {sense}')
    return species, chromosome, -1
Пример #14
0
def _construct(record, constructor=None, **kwargs):
    '''Construct the object of Sequence, DNA, RNA, or Protein.

    ``record`` is ``(seq, metadata, interval_metadata)``. ``lowercase``
    defaults to True unless the caller passes it. Without an explicit
    ``constructor`` the type is chosen from ``md['LOCUS']['unit']``: 'bp'
    gives DNA and 'aa' gives Protein. Any other unit leaves the constructor
    as None, so the final call fails.
    '''
    seq, md, imd = record
    kwargs.setdefault('lowercase', True)

    if constructor is None:
        unit = md['LOCUS']['unit']
        if unit == 'aa':
            constructor = Protein
        elif unit == 'bp':
            # RNA mol type has T instead of U for genbank from NCBI
            constructor = DNA

    if constructor == RNA:
        dna = DNA(seq, metadata=md, interval_metadata=imd, **kwargs)
        return dna.transcribe()
    return constructor(seq, metadata=md, interval_metadata=imd, **kwargs)
Пример #15
0
def _context_guide(exon_id,
                   start,
                   guide_direction,
                   chromosome,
                   context_length=5):
    '''
    Cut the 30 bp context around a guide out of the chromosome sequence.

    :exon_id: ensembl id
    :start: bp position start of guide(!) relative to chromosome
    :guide_direction: either 'FWD' or 'RVS'
    :chromosome: the chromosome this is on
    :context_length: option to adjust padding in bps TODO: implement
        (currently unused)
    :returns: azimuth compliant context 30mers (that is 5bp+protospacer+5bp)
        in capital letters
    :raises AssertionError: if positions 25-26 of the context are not 'GG'
    '''
    exon = gencode_exons().loc[exon_id]

    # .loc returns a DataFrame when the id occurs more than once; narrow it
    # down to the requested chromosome and use the first row
    if isinstance(exon, pd.DataFrame):
        exon = exon[exon.seqname == chromosome]
        if len(exon.start.unique()) != 1:
            logging.error(
                f'azimuth.py: same exon_id with different starts {exon}')
        exon = exon.iloc[0]

    is_reverse = guide_direction == 'RVS'
    start -= 3 if is_reverse else 4

    chromosome_seq = chromosomes()[exon['seqname']]
    seq = chromosome_seq[start:start + 30].upper()

    # if the strands don't match, it needs to be reversed
    if is_reverse:
        seq = str(DNA(seq).reverse_complement())

    assert seq[25:27] == 'GG', \
        'the generated context is invalid (PAM) site. {}, {}, {}'.format(
        seq, exon['strand'], guide_direction)
    return seq
Пример #16
0
def _find_context(guide, chromosome, position):
    '''
    Read the region around ``position`` from the chromosome's raw file and
    cut out the 30mer context of ``guide``.

    The guide is searched in the region as read; if it is not found there,
    it is searched in the reverse complement of the region.

    :returns: (context, label) where context is the 30mer and label is
        'sense' when the guide was only found in the reverse complement and
        'antisense' when it was found in the region as read;
        (None, None) when positions 25-26 of the context are not 'GG'
    :raises AssertionError: if the guide is found in neither orientation
    '''
    # NOTE(review): the label looks inverted relative to where the guide was
    # found - confirm against callers.
    OFFSET = 50
    if isinstance(position, str):
        position = int(position)

    with open(CHROMOSOME_RAW_FILE.format(chromosome)) as f:
        f.seek(position - OFFSET)
        region = f.read(2 * OFFSET + 20).upper()

    forward_index = region.find(guide)
    found_forward = forward_index != -1
    if found_forward:
        ret = region[forward_index - 4:forward_index + 23 + 3]
    else:
        rev_region = str(DNA(region).reverse_complement())
        rev_index = rev_region.find(guide)
        assert rev_index >= 0, f'guide not found.. {chromosome} {position}'
        ret = rev_region[rev_index - 4:rev_index + 23 + 3]

    if ret[25:27] != 'GG':
        print(
            f'gg required... {chromosome} {position} {guide}. Dropping :/')
        return None, None
    return ret, ('antisense' if found_forward else 'sense')
def hamming_dist(seq1, seq2):
    '''Computes the Hamming distance between DNA sequences.

    The result is a proportion of the sequence length. Positions where
    either sequence holds a degenerate character contribute the fraction of
    differing pairs among all combinations of their expanded options.
    '''

    degenerate_codes = ["R", "Y", "S", "W", "K", "M", "B", "D", "H", "V", "N"]

    # Without degenerate characters the plain distance is enough.
    if not any(code in seq1 + seq2 for code in degenerate_codes):
        return DNA(seq1).distance(DNA(seq2))

    # Characters pulled out of each sequence at degenerate positions. The
    # partner character from the other sequence is not necessarily
    # degenerate itself.
    removed_from_seq1 = []
    removed_from_seq2 = []

    initial_length = len(seq1)

    for code in degenerate_codes:
        # Occurrences in seq1 are handled first, then those left in seq2.
        while code in seq1 or code in seq2:
            if code in seq1:
                pos = seq1.index(code)
            else:
                pos = seq2.index(code)

            removed_from_seq1.append(seq1[pos])
            seq1 = seq1[:pos] + seq1[pos + 1:]

            removed_from_seq2.append(seq2[pos])
            seq2 = seq2[:pos] + seq2[pos + 1:]

    # distance() is a proportion, so scale it back to a count of sites.
    if len(seq1) > 0:
        nondegen_diff = DNA(seq1).distance(DNA(seq2)) * len(seq1)
    else:
        nondegen_diff = 0

    degen_diff = 0
    for char1, char2 in zip(removed_from_seq1, removed_from_seq2):
        options1 = list(DNA(str(char1)).expand_degenerates())
        options2 = list(DNA(str(char2)).expand_degenerates())

        num_diff = sum(1 for opt1 in options1
                       for opt2 in options2 if opt1 != opt2)
        total_compare = len(options1) * len(options2)

        # Each degenerate site counts as the share of differing pairs.
        degen_diff += num_diff / total_compare

    return (nondegen_diff + degen_diff) / initial_length
def main():
    """Demultiplex a gzipped FASTQ by barcodes found in the read names.

    Reads the tab-delimited metadata file, opens one gzipped output FASTQ
    per barcode plus one for reads that cannot be assigned, then streams
    the input FASTQ and writes every record to the file chosen by
    ``barcode_match`` for the record's header line.

    Output files are named ``<SampleID>[_<suffix>].fastq.gz`` and
    ``unknown[_<suffix>].fastq.gz`` inside the output folder.

    Exits via ``sys.exit`` when the output folder exists without --force,
    when required metadata columns are missing, when no barcodes are found
    or when barcodes differ in length. All output files are closed on the
    way out, including on errors.
    """

    parser = argparse.ArgumentParser(
        description="Demultiplex gzipped FASTQ based on barcodes present in "
        "readnames (not in sequence). The metadata file should be "
        "tab-delimited with one column named \"SampleID\" and one "
        "column named \"BarcodeSequence\". The barcodes are "
        "assumed to be at the end of the read names, before "
        "\"/1\" or \"/2\" if the reads are paired-end.",
        epilog='''Usage example:

python3 demult_barcode_readnames.py -f FASTQ -m METADATA -s data1_R1 -o \
OUTPUT_FOLDER

''',
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-f",
                        "--fastq",
                        metavar="FASTQ",
                        type=str,
                        help="Path to gzipped FASTQ file",
                        required=True)

    parser.add_argument("-m",
                        "--meta",
                        metavar="METADATA",
                        type=str,
                        help="Path to sample metadata file",
                        required=True)

    parser.add_argument("-s",
                        "--suffix",
                        metavar="SUFFIX",
                        type=str,
                        help="String to append to the end of each "
                        "output filename (before fastq.gz)",
                        required=False)

    parser.add_argument("-o",
                        "--output",
                        metavar="OUTPUT_FOLDER",
                        type=str,
                        help="Output folder to write FASTQs",
                        required=False,
                        default="output_demult")

    parser.add_argument("-e",
                        "--errors",
                        metavar="FLOAT",
                        type=float,
                        help="Number of errors allowed in barcode",
                        required=False,
                        default=1.5)

    parser.add_argument("-r",
                        "--revcomp",
                        action="store_true",
                        help="Flag to indicate that barcodes in "
                        "sample_metadata file should be reverse "
                        "complemented before matching.",
                        required=False,
                        default=False)

    parser.add_argument("--maxN",
                        metavar="INT",
                        type=int,
                        help="Max number of N characters allowed in read "
                        "barcode.",
                        required=False,
                        default=1)

    parser.add_argument("--force",
                        action="store_true",
                        help="Flag to indicate that command should be run "
                        "even if output folder exists",
                        required=False,
                        default=False)

    args = parser.parse_args()

    # Check if output directory exists.
    if os.path.exists(args.output):
        if not args.force:
            sys.exit("Output directory exists and --force option not set so "
                     "stopping.")
    else:
        os.makedirs(args.output)

    # The suffix belongs before the ".fastq.gz" extension, as the --suffix
    # help text states, so the outputs keep a valid extension.
    suffix = "_" + args.suffix if args.suffix else ""

    # Dict to keep track of all filehandles (barcode -> open file).
    sample_fh = {}

    # Set var to identify header of file.
    first_line = True

    try:
        # Read through sample metadata file and create filehandle for each
        # barcode.
        with open(args.meta, "rt") as meta_in:
            for meta_line in meta_in:

                # Strip off line terminator and split on tabs.
                meta_line = meta_line.rstrip("\r\n")

                # Blank lines (e.g. a trailing newline) carry no sample.
                if not meta_line.strip():
                    continue

                meta_line_split = meta_line.split("\t")

                # If line one then figure out which columns are SampleID
                # and BarcodeSequence
                if first_line:

                    if "SampleID" in meta_line_split:
                        sample_col = meta_line_split.index("SampleID")
                    elif "#SampleID" in meta_line_split:
                        sample_col = meta_line_split.index("#SampleID")
                    else:
                        sys.exit("No column named \"SampleID\" or "
                                 "\"#SampleID\" in metadata file")

                    if "BarcodeSequence" in meta_line_split:
                        barcode_col = meta_line_split.index("BarcodeSequence")
                    else:
                        sys.exit("No column named \"BarcodeSequence\" in "
                                 "metadata file")

                    first_line = False
                    continue

                # Otherwise identify sample and barcode combo and open
                # filehandle.
                sample = meta_line_split[sample_col]
                barcode = meta_line_split[barcode_col]

                # Take reverse complement of barcode if --revcomp set.
                if args.revcomp:
                    barcode = str(
                        DNA(barcode, validate=True,
                            lowercase=True).reverse_complement())

                outfile = os.path.join(args.output,
                                       sample + suffix + ".fastq.gz")

                sample_fh[barcode] = gzip.open(outfile, "wt")

                print("Writing reads for sample " + sample +
                      " with barcode " + barcode + " to file " + outfile,
                      file=sys.stderr)

        # Also open output file for reads which cannot be demultiplexed.
        unknown_out = os.path.join(args.output,
                                   "unknown" + suffix + ".fastq.gz")

        sample_fh["unknown"] = gzip.open(unknown_out, "wt")

        print("Writing reads with unknown barcode to " + unknown_out,
              file=sys.stderr)

        # Check that there are barcodes and that all are the same length.
        barcode_lengths = {len(b) for b in sample_fh if b != "unknown"}

        if not barcode_lengths:
            sys.exit("Error no barcodes found in metadata file.")

        if len(barcode_lengths) > 1:
            sys.exit("Error barcodes in metadata file are of varying "
                     "lengths.")

        barcode_length = barcode_lengths.pop()

        # Initialize fastq line counter at 4 so that the very first line is
        # treated as a header (every 4th line after it is one too).
        fastq_lc = 4

        # Read through FASTQ and demultiplex based on barcode matches.
        with gzip.open(args.fastq, 'rt') as fastq_in:
            for fastq_line in fastq_in:

                if fastq_lc == 4:
                    # Header line: pick the output for this record by
                    # checking whether any barcode is within args.errors of
                    # this read's barcode.
                    # NOTE(review): barcode_match is assumed to return a key
                    # of sample_fh ("unknown" when nothing matches).
                    last_barcode = barcode_match(fastq_line, sample_fh.keys(),
                                                 args.errors, barcode_length,
                                                 args.maxN)

                    fastq_lc = 1
                else:
                    fastq_lc += 1

                # All four lines of a record go to the same file.
                print(fastq_line, file=sample_fh[last_barcode], end='')
    finally:
        # Close every filehandle, also when exiting early.
        for fh in sample_fh.values():
            fh.close()
Пример #19
0
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_offset=None):
    """parses fastq single-end read file

    Generator that walks the barcode fastq and the read fastq in lockstep,
    assigns each read to a sample via its barcode, quality filters it and
    yields the reads that pass.

    fastq_read_f: read fastq; either a file-like object (readline/seek) or
     an indexable sequence of lines
    fastq_barcode_f: barcode fastq, parallel to fastq_read_f
    barcode_to_sample_id: dict of barcode -> sample id
    store_unassigned: if True, reads without a sample id are kept under
     the sample id 'Unassigned' instead of being skipped
    max_bad_run_length, phred_quality_threshold, seq_max_N,
     filter_bad_illumina_qual_digit: passed through to
     quality_filter_sequence
    min_per_read_length_fraction: multiplied by the length of the second
     line of fastq_read_f to get the minimum read length passed to
     quality_filter_sequence
    rev_comp: reverse complement the read (and reverse its quality) before
     yielding
    rev_comp_barcode: reverse complement the barcode before correction
    start_seq_id: first sequence id; incremented after every yielded read
    log_f, histogram_f: optional writable objects that receive the log and
     the length histogram after all reads were processed
    barcode_correction_fn: passed through to correct_barcode
    max_barcode_errors: reads with more barcode errors are skipped
    strict_header_match: raise if barcode and read headers do not match
    phred_offset: 33 or 64; detected from the first header line when None

    Yields (fasta_header, sequence, quality, seq_id).

    Raises ValueError for a PHRED offset other than 33 or 64 and for an
    unknown quality filter result; raises FastqParseError when
    strict_header_match is set and a barcode header does not match its
    read header.
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2

    seq_id = start_seq_id
    # grab the first lines and then seek back to the beginning of the file
    try:
        fastq_read_f_line1 = fastq_read_f.readline()
        fastq_read_f_line2 = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        # not a file-like object: treat the input as an indexable sequence
        # of lines
        fastq_read_f_line1 = fastq_read_f[0]
        fastq_read_f_line2 = fastq_read_f[1]

    # detect the PHRED offset from the first header line if not given
    if phred_offset is None:
        post_casava_v180 = is_casava_v180_or_later(fastq_read_f_line1)
        if post_casava_v180:
            phred_offset = 33
        else:
            phred_offset = 64

    # the header comparison function is selected by PHRED offset
    if phred_offset == 33:
        check_header_match_f = check_header_match_180_or_later
    elif phred_offset == 64:
        check_header_match_f = check_header_match_pre180
    else:
        raise ValueError("Invalid PHRED offset: %d" % phred_offset)

    # compute the barcode length, if they are all the same.
    # this is useful for selecting a subset of the barcode read
    # if it's too long (e.g., for technical reasons on the sequencer)
    barcode_lengths = set(
        [len(bc) for bc, sid in barcode_to_sample_id.items()])
    if len(barcode_lengths) == 1:
        barcode_length = barcode_lengths.pop()
    else:
        barcode_length = None

    # compute the minimum read length as a fraction of the length of the input
    # read
    min_per_read_length = min_per_read_length_fraction * \
        len(fastq_read_f_line2)

    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}
    # iterate barcode records and read records in lockstep
    for bc_data, read_data in izip(
            parse_fastq(fastq_barcode_f,
                        strict=False,
                        phred_offset=phred_offset),
            parse_fastq(fastq_read_f, strict=False,
                        phred_offset=phred_offset)):
        input_sequence_count += 1
        # Confirm match between barcode and read headers
        if strict_header_match and \
           (not check_header_match_f(bc_data[header_index], read_data[header_index])):
            raise FastqParseError(
                "Headers of barcode and read do not match. Can't continue. "
                "Confirm that the barcode fastq and read fastq that you are "
                "passing match one another.")
        else:
            header = read_data[header_index]

        # Grab the barcode sequence
        if barcode_length:
            # because thirteen cycles are sometimes used for
            # technical reasons, this step looks only at the
            # first twelve bases. note that the barcode is
            # rev-comp'ed after this step if requested since
            # the thirteenth base is a technical artefact, not
            # barcode sequence.
            barcode = bc_data[sequence_index][:barcode_length]
        else:
            barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = str(DNA(barcode).rc())
        # Grab the read sequence
        sequence = read_data[1]
        # Grab the read quality
        quality = read_data[2]

        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
            correct_barcode(
                barcode,
                barcode_to_sample_id,
                barcode_correction_fn)
        # skip samples with too many errors
        if (num_barcode_errors > max_barcode_errors):
            count_barcode_errors_exceed_max += 1
            continue

        # skip unassignable samples unless otherwise requested
        if sample_id is None:
            if not store_unassigned:
                count_barcode_not_in_map += 1
                continue
            else:
                sample_id = 'Unassigned'

        # the filter returns a status code plus the (possibly modified)
        # sequence and quality
        quality_filter_result, sequence, quality =\
            quality_filter_sequence(header,
                                    sequence,
                                    quality,
                                    max_bad_run_length,
                                    phred_quality_threshold,
                                    min_per_read_length,
                                    seq_max_N,
                                    filter_bad_illumina_qual_digit)

        # process quality result
        if quality_filter_result != 0:
            # if the quality filter didn't pass record why and
            # move on to the next record
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError("Unknown quality filter result: %d" %
                                 quality_filter_result)
            continue

        sequence_lengths.append(len(sequence))

        # count the reads kept per sample
        try:
            seqs_per_sample_counts[sample_id] += 1
        except KeyError:
            seqs_per_sample_counts[sample_id] = 1

        if rev_comp:
            sequence = str(DNA(sequence).rc())
            # keep quality scores aligned with the reversed bases
            quality = quality[::-1]

        # the header records the original (uncorrected) barcode as well as
        # the corrected one
        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\
            (sample_id, seq_id, header, barcode,
             corrected_barcode, num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1

    # Add sample IDs with zero counts to dictionary for logging
    for curr_sample_id in barcode_to_sample_id.values():
        if curr_sample_id not in seqs_per_sample_counts.keys():
            seqs_per_sample_counts[curr_sample_id] = 0

    # the log and histogram are only written once the generator has been
    # fully consumed
    if log_f is not None:
        log_str = format_split_libraries_fastq_log(
            count_barcode_not_in_map, count_too_short, count_too_many_N,
            count_bad_illumina_qual_digit, count_barcode_errors_exceed_max,
            input_sequence_count, sequence_lengths, seqs_per_sample_counts)
        log_f.write(log_str)

    if len(sequence_lengths) and histogram_f is not None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_str = format_histogram_one_count(counts, bin_edges)
        histogram_f.write(histogram_str)
        histogram_f.write('\n--\n\n')
Пример #20
0
def get_primers(header, mapping_data):
    """ Returns lists of forward/reverse primer regular expression generators

    header:  list of strings of header data.
    mapping_data:  list of lists of mapping data

    Each primer field may hold a comma-separated pool of primers. Primers are
    stripped of surrounding whitespace and upper-cased; blank entries are
    skipped so they can never become an empty pattern that matches every read.

    Returns (forward_primers, reverse_primers), two lists of compiled regular
    expressions in which IUPAC degenerate symbols are expanded to character
    classes. The forward list also holds the reverse complements of the
    reverse primers, and the reverse list the reverse complements of the
    forward primers.

    Will raise IndexError if either the LinkerPrimerSequence or ReversePrimer
        fields are not present, and ValueError if no forward or no reverse
        primer is found in mapping_data.
    """

    if "LinkerPrimerSequence" in header:
        primer_ix = header.index("LinkerPrimerSequence")
    else:
        raise IndexError(
            ("Mapping file is missing LinkerPrimerSequence field."))
    if "ReversePrimer" in header:
        rev_primer_ix = header.index("ReversePrimer")
    else:
        raise IndexError(("Mapping file is missing ReversePrimer field."))

    # IUPAC nucleotide codes mapped to the regex fragment they stand for
    iupac = {
        'A': 'A',
        'T': 'T',
        'G': 'G',
        'C': 'C',
        'R': '[AG]',
        'Y': '[CT]',
        'S': '[GC]',
        'W': '[AT]',
        'K': '[GT]',
        'M': '[AC]',
        'B': '[CGT]',
        'D': '[AGT]',
        'H': '[ACT]',
        'V': '[ACG]',
        'N': '[ACGT]'
    }

    raw_forward_primers = set()
    raw_reverse_primers = set()

    for line in mapping_data:
        # Split on commas to handle pool of primers
        for primer in line[primer_ix].split(','):
            primer = primer.strip().upper()
            if primer:
                raw_forward_primers.add(primer)
        for primer in line[rev_primer_ix].split(','):
            primer = primer.strip().upper()
            if primer:
                raw_reverse_primers.add(primer)

    if not raw_forward_primers:
        raise ValueError(("No forward primers detected in mapping file."))
    if not raw_reverse_primers:
        raise ValueError(("No reverse primers detected in mapping file."))

    # Reverse complements are computed once, after every row has been read,
    # and before the sets are merged below.
    raw_forward_rc_primers = set(
        str(DNA(primer).rc()) for primer in raw_forward_primers)
    raw_reverse_rc_primers = set(
        str(DNA(primer).rc()) for primer in raw_reverse_primers)

    # Finding the forward primers, or rc of reverse primers indicates forward
    # read. Finding the reverse primer, or rc of the forward primers, indicates
    # the reverse read, so these sets are merged.
    raw_forward_primers.update(raw_reverse_rc_primers)
    raw_reverse_primers.update(raw_forward_rc_primers)

    forward_primers = [
        compile(''.join([iupac[symbol] for symbol in curr_primer]))
        for curr_primer in raw_forward_primers
    ]
    reverse_primers = [
        compile(''.join([iupac[symbol] for symbol in curr_primer]))
        for curr_primer in raw_reverse_primers
    ]

    return forward_primers, reverse_primers
Пример #21
0
def _extract_label_barcode(label, char_delineator, bc_len):
    """ Returns up to bc_len characters following the last char_delineator

    label: fastq label line
    char_delineator: character(s) that immediately precede the barcode
    bc_len: maximum number of characters to return

    Raises IndexError if char_delineator does not occur in the label.
    """
    # Surrounding whitespace (e.g. a trailing newline) is not barcode data
    stripped_label = label.strip()
    # str.split never raises when the separator is missing, so the absence
    # of the delineator has to be tested for explicitly.
    if char_delineator not in stripped_label:
        raise IndexError("Found sequence lacking character delineator. "
                         "Sequence header %s, character delineator %s" %
                         (label, char_delineator))
    return stripped_label.split(char_delineator)[-1][0:bc_len]


def process_barcode_in_label(read1_data,
                             read2_data,
                             output_bc_fastq,
                             bc1_len=6,
                             bc2_len=6,
                             rev_comp_bc1=False,
                             rev_comp_bc2=False,
                             char_delineator=":"):
    """ Reads data from one or two fastq labels, writes output barcodes file.

    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores, False if no read 2.
    output_bc_fastq: open output fastq filepath
    bc1_len: length of barcode to remove from beginning of read1 data
    bc2_len: length of barcode to remove from beginning of read2 data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.

    Raises IndexError if a label does not contain char_delineator, and
    ValueError if the combined barcode comes out empty.
    """
    header_index = 0

    bc1_read = _extract_label_barcode(read1_data[header_index],
                                      char_delineator, bc1_len)

    # Create fake quality scores, using 6 here to match the existing qual fake
    # qual scores that were all F.
    bc1_qual = np.ones(len(bc1_read), dtype=np.int8) * 6
    if rev_comp_bc1:
        bc1_read = str(DNA(bc1_read).rc())

    if read2_data:
        bc2_read = _extract_label_barcode(read2_data[header_index],
                                          char_delineator, bc2_len)
        bc2_qual = np.ones(len(bc2_read), dtype=np.int8) * 6
        if rev_comp_bc2:
            bc2_read = str(DNA(bc2_read).rc())
    else:
        bc2_read = ""
        bc2_qual = np.array([], dtype=np.int8)

    if not bc1_read and not bc2_read:
        raise ValueError("Came up with empty barcode sequence, please check "
                         "character delineator with -s, and fastq label "
                         "%s" % read1_data[header_index])

    # The record is always written under the read 1 label
    bc_lines = format_fastq_record(read1_data[header_index],
                                   bc1_read + bc2_read,
                                   np.hstack([bc1_qual, bc2_qual]))

    output_bc_fastq.write(bc_lines)

    return
Пример #22
0
def _split_terminal_barcodes(values, bc1_len, bc2_len):
    """ Splits values into (leading barcode, trailing barcode, remainder)

    values: sliceable read sequence or quality scores
    bc1_len: number of leading items belonging to barcode 1
    bc2_len: number of trailing items belonging to barcode 2
    """
    # An explicit start index keeps bc2_len == 0 meaning "no trailing
    # barcode"; values[-0:] would return everything and values[x:-0] nothing.
    bc2_start = max(len(values) - bc2_len, 0)
    return values[0:bc1_len], values[bc2_start:], values[bc1_len:bc2_start]


def process_barcode_paired_stitched(read_data,
                                    output_bc_fastq,
                                    output_fastq,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq_out_not_oriented=None,
                                    switch_bc_order=False):
    """ Processes stitched barcoded reads, writes barcode, parsed stitched read

    read_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of read1 stitched data
    bc2_len: length of barcode to remove from end of read2 stitched data. A
        value of 0 removes nothing from the end of the read.
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads
        according to the forward primers in the mapping file. If primer is
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq_out_not_oriented: Open filepath to write reads where primers
        can't be found when attempt_read_orientation is True.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the the parameter chosen for the fastq files.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    read_seq = read_data[sequence_index]
    read_qual = read_data[quality_index]

    found_primer_match = False
    if attempt_read_orientation:
        # any() stops at the first matching primer
        if any(primer.search(read_seq) for primer in forward_primers):
            found_primer_match = True
        elif any(primer.search(read_seq) for primer in reverse_primers):
            # A reverse primer hit means the read is in the opposite
            # orientation, so flip both the bases and their qualities.
            read_seq = str(DNA(read_seq).rc())
            read_qual = read_qual[::-1]
            found_primer_match = True

    if not found_primer_match and attempt_read_orientation:
        output_bc = output_bc_not_oriented
        output_read = fastq_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read = output_fastq

    bc_read1, bc_read2, trimmed_seq = _split_terminal_barcodes(
        read_seq, bc1_len, bc2_len)
    bc_qual1, bc_qual2, trimmed_qual = _split_terminal_barcodes(
        read_qual, bc1_len, bc2_len)

    if rev_comp_bc1:
        bc_read1 = str(DNA(bc_read1).rc())
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = str(DNA(bc_read2).rc())
        bc_qual2 = bc_qual2[::-1]

    if switch_bc_order:
        bc_read1, bc_read2 = bc_read2, bc_read1
        bc_qual1, bc_qual2 = bc_qual2, bc_qual1

    bc_lines = format_fastq_record(read_data[header_index],
                                   bc_read1 + bc_read2,
                                   np.hstack([bc_qual1, bc_qual2]))
    output_bc.write(bc_lines)
    seq_lines = format_fastq_record(read_data[header_index], trimmed_seq,
                                    trimmed_qual)
    output_read.write(seq_lines)

    return
Пример #23
0
def process_barcode_paired_end_data(read1_data,
                                    read2_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    output_fastq2,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq1_out_not_oriented=None,
                                    fastq2_out_not_oriented=None):
    """ Processes, writes paired-end barcode data, parsed sequences

    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads 1 filepath
    output_fastq2: open output fastq reads 2 filepath
    bc1_len: length of barcode to remove from beginning of read1 data
    bc2_len: length of barcode to remove from beginning of read2 data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads
        according to the forward primers in the mapping file. If primer is
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq1_out_not_oriented: Open filepath to write reads 1 where primers
        can't be found when attempt_read_orientation is True.
    fastq2_out_not_oriented: Open filepath to write reads 2 where primers
        can't be found when attempt_read_orientation is True.

    When orientation is attempted and both a forward and a reverse primer are
    located, each barcode is taken from the (up to) bc_len bases immediately
    before the primer hit and each read is written starting at its primer.
    Otherwise the barcodes are the first bc1_len / bc2_len bases of the reads.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2
    # Start offsets of the forward / reverse primer hits; None = not located
    bc1_end = None
    bc2_end = None
    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        seq1 = read1_data[sequence_index]
        seq2 = read2_data[sequence_index]
        # First check forward primers; the read holding the forward primer
        # becomes read 1.
        for curr_primer in forward_primers:
            hit = curr_primer.search(seq1)
            if hit:
                read1 = read1_data
                read2 = read2_data
                found_primer_match = True
                bc1_end = hit.start()
                break
            hit = curr_primer.search(seq2)
            if hit:
                read1 = read2_data
                read2 = read1_data
                found_primer_match = True
                bc1_end = hit.start()
                break
        # A reverse primer is only looked for once a forward primer was
        # found; the read pair counts as oriented only if both are located.
        # NOTE(review): this loop re-assigns read1/read2 from the reverse
        # primer hit alone; if that disagrees with the forward primer hit,
        # bc1_end is applied to the other read - confirm this is intended.
        if found_primer_match:
            for curr_primer in reverse_primers:
                hit = curr_primer.search(seq1)
                if hit:
                    read1 = read2_data
                    read2 = read1_data
                    found_primer_match = True
                    bc2_end = hit.start()
                    break
                hit = curr_primer.search(seq2)
                if hit:
                    read1 = read1_data
                    read2 = read2_data
                    found_primer_match = True
                    bc2_end = hit.start()
                    break
                # This primer matched neither read; stays False unless a
                # later reverse primer matches.
                found_primer_match = False
    else:
        read1 = read1_data
        read2 = read2_data

    if not found_primer_match and attempt_read_orientation:
        read1 = read1_data
        read2 = read2_data
        output_bc = output_bc_not_oriented
        output_read1 = fastq1_out_not_oriented
        output_read2 = fastq2_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read1 = output_fastq1
        output_read2 = output_fastq2

    # Compare against None: a primer hit at offset 0 is still a hit.
    if bc1_end is not None and bc2_end is not None:
        # Clamp at 0 so a primer closer than bc_len to the read start gives
        # a shortened barcode instead of a wrapped-around negative slice.
        bc1_start = max(bc1_end - bc1_len, 0)
        bc2_start = max(bc2_end - bc2_len, 0)
        bc_read1 = read1[sequence_index][bc1_start:bc1_end]
        bc_read2 = read2[sequence_index][bc2_start:bc2_end]
        bc_qual1 = read1[quality_index][bc1_start:bc1_end]
        bc_qual2 = read2[quality_index][bc2_start:bc2_end]
    else:
        bc_read1 = read1[sequence_index][0:bc1_len]
        bc_read2 = read2[sequence_index][0:bc2_len]
        bc_qual1 = read1[quality_index][0:bc1_len]
        bc_qual2 = read2[quality_index][0:bc2_len]

    if rev_comp_bc1:
        bc_read1 = str(DNA(bc_read1).rc())
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = str(DNA(bc_read2).rc())
        bc_qual2 = bc_qual2[::-1]

    bc_lines = format_fastq_record(read1[header_index], bc_read1 + bc_read2,
                                   np.hstack([bc_qual1, bc_qual2]))
    output_bc.write(bc_lines)

    if found_primer_match and attempt_read_orientation:
        # Oriented reads are written from their primer onwards. An offset of
        # None (no reverse primers supplied) keeps the whole read.
        read1_start = bc1_end
        read2_start = bc2_end
    else:
        read1_start = bc1_len
        read2_start = bc2_len

    seq1_lines = format_fastq_record(read1[header_index],
                                     read1[sequence_index][read1_start:],
                                     read1[quality_index][read1_start:])
    output_read1.write(seq1_lines)
    seq2_lines = format_fastq_record(read2[header_index],
                                     read2[sequence_index][read2_start:],
                                     read2[quality_index][read2_start:])
    output_read2.write(seq2_lines)

    return
Пример #24
0
from skbio.alignment._pairwise import global_pairwise_align_nucleotide
from skbio.sequence import DNA

# Smoke test: globally align two short nucleotide sequences (the alignment
# result is discarded) and then report the installed scikit-bio version.
query_seq = DNA("GCAAAAGCTGGTATTAAAGT")
subject_seq = DNA("GCATATTACGTGGTGATTCAAGAGGCCTTCG")

# NOTE(review): the four positional numbers are presumably the gap-open,
# gap-extend, match and mismatch scores in that order - confirm against the
# installed scikit-bio signature.
global_pairwise_align_nucleotide(
    query_seq, subject_seq, 5, 1, 5, -2, penalize_terminal_gaps=True)

from skbio import __version__ as v

print(v)
Пример #25
0
    def create_primer_regex_patterns(self, header, mapping_data):
        """ Returns lists of forward/reverse primer regular expression

            header:  list of strings of header data.
            mapping_data:  list of lists of mapping data

            Each primer field may hold a comma-separated pool of primers.
            Forward and reverse primers are both stripped of surrounding
            whitespace and upper-cased, and blank entries are skipped so they
            cannot become an empty pattern that matches every read. Only the
            first self.search_length symbols of a primer go into its pattern,
            with each symbol translated through self.iupac.

            Returns four lists of compiled patterns: forward primers, reverse
            complements of the forward primers, reverse primers, and reverse
            complements of the reverse primers.

            Sets self.logger as a side effect.

            Will raise IndexError if either the LinkerPrimerSequence or
            ReversePrimer fields are not present, and ValueError if no
            forward or no reverse primer is found.
        """
        import logging
        self.logger = logging.getLogger('_getprm_')

        if "LinkerPrimerSequence" in header:
            primer_ix = header.index("LinkerPrimerSequence")
        else:
            raise IndexError(
                ("Mapping file is missing LinkerPrimerSequence field."))
        if "ReversePrimer" in header:
            rev_primer_ix = header.index("ReversePrimer")
        else:
            raise IndexError(("Mapping file is missing ReversePrimer field."))

        raw_forward_primers = set()
        raw_reverse_primers = set()

        for line in mapping_data:
            # Split on commas to handle pool of primers
            for primer in line[primer_ix].split(','):
                primer = primer.strip().upper()
                if primer:
                    raw_forward_primers.add(primer)
            # Reverse primers are used as written (no reverse complement is
            # taken here). They are normalised before being handed to DNA so
            # that case and stray whitespace are treated as for the forward
            # primers.
            for primer in line[rev_primer_ix].split(','):
                primer = primer.strip().upper()
                if primer:
                    raw_reverse_primers.add(str(DNA(primer)))

        if not raw_forward_primers:
            self.logger.critical(
                "No forward primers detected in mapping file.")
            raise ValueError("No forward primers detected in mapping file.")

        if not raw_reverse_primers:
            self.logger.critical(
                "No reverse primers detected in mapping file.")
            raise ValueError("No reverse primers detected in mapping file.")

        def to_pattern(symbols):
            # Expand every symbol through the IUPAC table and compile
            return compile(''.join(
                [self.iupac[symbol] for symbol in symbols]))

        forward_primers = []
        forward_primers_rc = []
        reverse_primers = []
        reverse_primers_rc = []

        for curr_primer in raw_forward_primers:
            search_region = curr_primer[:self.search_length]
            forward_primers.append(to_pattern(search_region))
            forward_primers_rc.append(
                to_pattern(self.reverse_complement(search_region)))

        for curr_primer in raw_reverse_primers:
            search_region = curr_primer[:self.search_length]
            reverse_primers.append(to_pattern(search_region))
            reverse_primers_rc.append(
                to_pattern(self.reverse_complement(search_region)))

        return (forward_primers, forward_primers_rc, reverse_primers,
                reverse_primers_rc)
Пример #26
0
from skbio.sequence import DNA

CS_FILE = 'q2.data'
GENOME_FILE = 'genome.data'

# Both files are read in full as lists of raw lines (terminators included).
with open(CS_FILE, 'r') as f:
    control_sequence_list = f.readlines()

with open(GENOME_FILE, 'r') as f:
    genome_list = f.readlines()

if not control_sequence_list:
    raise ValueError("%s contains no sequences" % CS_FILE)

# Strip only the line terminator: slicing off the last character would drop
# a real base whenever the first line is not newline-terminated.
a = DNA(control_sequence_list[0].rstrip('\r\n'))
print(a)
Пример #27
0
def main():
    """Script entry point: demultiplex and quality filter fastq reads.

    Parses the command line options, validates them (reporting problems
    through option_parser.error), then processes every sequence read file
    together with its barcode read file and mapping file. Passing reads are
    written to seqs.fna in the output directory, optionally along with
    seqs.qual and seqs.fastq; a log and sequence length histograms are
    written to split_library_log.txt and histograms.txt. The sequence outputs
    are written under temporary '.incomplete' names and renamed only once
    all inputs have been processed.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    read_arguments_from_file = opts.read_arguments_from_file

    # these arguments can optionally be read from a file, reasoning is to
    # allow arguments that would span over hundreds of samples and would be
    # prohibitive to execute as a command line call
    if read_arguments_from_file:
        # sample_ids is the only one of these arguments that's returned as a
        # string, the rest of them are lists
        if opts.sample_ids:
            opts.sample_ids = ','.join(parse_items(opts.sample_ids))
        if opts.sequence_read_fps:
            opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0])
        if opts.barcode_read_fps:
            opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0])
        if opts.mapping_fps:
            opts.mapping_fps = parse_items(opts.mapping_fps[0])

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_offset = int(phred_offset)
        except ValueError:
            # shouldn't be able to get here...
            option_parser.error(
                "If --phred_offset is provided, it must be a valid integer.")

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 < min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be greater '
                            'than 0 and less than or equal to 1. You passed '
                            '%1.5f.' % min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    # a single mapping file is shared by all sequence read files
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    input_counts = set([
        len(sequence_read_fps),
        len(barcode_read_fps),
        len(mapping_fps)
    ])
    if len(input_counts) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')

        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:

        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')

        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write(format_fastq_record(h, s, q))
    else:

        def fastq_writer(h, s, q):
            pass

    def md5_hexdigest(fp):
        # hash the file and release its handle straight away
        with open(fp) as hashed_f:
            return safe_md5(hashed_f).hexdigest()

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    # keyword arguments passed unchanged to whichever read processor is used
    shared_kwargs = dict(
        store_unassigned=retain_unassigned_reads,
        max_bad_run_length=max_bad_run_length,
        phred_quality_threshold=phred_quality_threshold,
        min_per_read_length_fraction=min_per_read_length_fraction,
        rev_comp=rev_comp,
        seq_max_N=seq_max_N,
        filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
        log_f=log_f,
        histogram_f=histogram_f,
        phred_offset=phred_offset)

    # the three lists were checked above to have the same length
    for i, (sequence_read_fp, barcode_read_fp, mapping_fp) in enumerate(
            zip(sequence_read_fps, barcode_read_fps, mapping_fps)):
        # Input handles opened for this set of files. They are closed once
        # the set has been processed, so that runs over hundreds of samples
        # do not accumulate open files.
        input_handles = []
        try:
            if mapping_fp is not None:
                mapping_f = open(mapping_fp, 'U')
                input_handles.append(mapping_f)
                _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                    mapping_f,
                    disable_primer_check=True,
                    has_barcodes=barcode_read_fp is not None)
            else:
                barcode_to_sample_id = {}

            if rev_comp_mapping_barcodes:
                barcode_to_sample_id = {
                    str(DNA(k).rc()): v
                    for k, v in barcode_to_sample_id.items()
                }

            if barcode_type == 'golay_12':
                invalid_golay_barcodes = get_invalid_golay_barcodes(
                    barcode_to_sample_id.keys())
                if len(invalid_golay_barcodes) > 0:
                    option_parser.error(
                        "Some or all barcodes are not valid golay "
                        "codes. Do they need to be reverse complemented? If these "
                        "are not golay barcodes pass --barcode_type 12 to disable "
                        "barcode error correction, or pass --barcode_type # if "
                        "the barcodes are not 12 base pairs, where # is the size "
                        "of the barcodes. Invalid codes:\n\t%s" %
                        ' '.join(invalid_golay_barcodes))

            log_f.write("Input file paths\n")
            if mapping_fp is not None:
                log_f.write('Mapping filepath: %s (md5: %s)\n' %
                            (mapping_fp, md5_hexdigest(mapping_fp)))
            log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                        (sequence_read_fp, md5_hexdigest(sequence_read_fp)))

            if sequence_read_fp.endswith('.gz'):
                sequence_read_f = gzip_open(sequence_read_fp)
            else:
                sequence_read_f = open(sequence_read_fp, 'U')
            input_handles.append(sequence_read_f)

            # keeps seq_id defined when the generator yields no reads
            seq_id = start_seq_id

            if barcode_read_fp is not None:
                log_f.write(
                    'Barcode read filepath: %s (md5: %s)\n\n' %
                    (barcode_read_fp, md5_hexdigest(barcode_read_fp)))

                if barcode_read_fp.endswith('.gz'):
                    barcode_read_f = gzip_open(barcode_read_fp)
                else:
                    barcode_read_f = open(barcode_read_fp, 'U')
                input_handles.append(barcode_read_f)

                seq_generator = process_fastq_single_end_read_file(
                    sequence_read_f,
                    barcode_read_f,
                    barcode_to_sample_id,
                    rev_comp_barcode=rev_comp_barcode,
                    barcode_correction_fn=barcode_correction_fn,
                    max_barcode_errors=max_barcode_errors,
                    start_seq_id=start_seq_id,
                    **shared_kwargs)
            else:
                seq_generator = process_fastq_single_end_read_file_no_barcode(
                    sequence_read_f,
                    sample_ids[i],
                    start_seq_id=start_seq_id,
                    **shared_kwargs)

            for fasta_header, sequence, quality, seq_id in seq_generator:
                output_f.write('>%s\n%s\n' % (fasta_header, sequence))
                qual_writer(fasta_header, quality)
                fastq_writer(fasta_header, sequence, quality)

            # numbering of the next file continues after the last id seen
            start_seq_id = seq_id + 1
            log_f.write('\n---\n\n')
        finally:
            for input_handle in input_handles:
                input_handle.close()

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)

    log_f.close()
    histogram_f.close()
def main():
    """Command-line entry point: slice the primer-bounded region out of FASTA records.

    Parses the command line, loads the input FASTA with ``read_fasta``, and
    for every record locates the forward primer and the reverse complement of
    the reverse primer with ``seq_match_start``. Records are skipped (with a
    message on stderr) when either primer matches more than once, is not
    found, or when the forward primer starts after the reverse primer. For
    the remaining records the identifier and the sliced region are written to
    the output file, one per line. With ``--no_primer`` the slice excludes
    both primers; otherwise it includes them.
    """

    parser = argparse.ArgumentParser(
        description="Slice out amplified region of gene based on forward "
        "and reverse primers, which can contain degenerate bases.",
        epilog='''Usage example:

python3 slice_amplified_region.py -i FASTA -f ACGCGHNRAACCTTACC
-r ACGGGCRGTGWGTRCAA -o OUT_FASTA

''',
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-i",
                        "--input",
                        metavar="IN_FASTA",
                        type=str,
                        help="Path to input FASTA",
                        required=True)

    parser.add_argument("-o",
                        "--output",
                        metavar="OUT_FASTA",
                        type=str,
                        help="Path to output FASTA",
                        required=True)

    parser.add_argument("-f",
                        "--forward",
                        metavar="FORWARD_PRIMER",
                        type=str,
                        help="Forward primer sequence.",
                        required=True)

    parser.add_argument("-r",
                        "--reverse",
                        metavar="REVERSE_PRIMER",
                        type=str,
                        help="Reverse primer sequence.",
                        required=True)

    # The two help fragments are concatenated, so the first one needs a
    # trailing space to avoid rendering "removedin".
    parser.add_argument("--no_primer",
                        action="store_true",
                        help="Flag to indicate that primers should be removed "
                        "in output sequences.",
                        required=False)

    args = parser.parse_args()

    input_fasta = read_fasta(args.input)

    # The reverse primer is searched for as its reverse complement. Compute
    # it before opening the output so that an invalid primer does not leave
    # an empty output file behind.
    forward_primer = args.forward
    reverse_primer = str(DNA(args.reverse).reverse_complement())

    # The context manager guarantees the output is flushed and closed even
    # if matching raises part-way through the input.
    with open(args.output, "w") as out_fasta:

        for seq_id, sequence in input_fasta.items():

            # Figure out where forward and reverse primers match.
            forward_start = seq_match_start(sequence, forward_primer)
            reverse_start = seq_match_start(sequence, reverse_primer)

            # Ambiguous matches are reported first, then missing primers,
            # then primers found in the wrong order.
            if forward_start == "multiple":
                print("Skipping",
                      seq_id,
                      "due to multiple matches of forward primer",
                      file=sys.stderr)
                continue
            if reverse_start == "multiple":
                print("Skipping",
                      seq_id,
                      "due to multiple matches of reverse primer",
                      file=sys.stderr)
                continue
            if forward_start is None:
                print("Forward primer not found in", seq_id, file=sys.stderr)
                continue
            if reverse_start is None:
                print("Reverse primer not found in", seq_id, file=sys.stderr)
                continue
            if forward_start > reverse_start:
                print("Forward primer matches after reverse in",
                      seq_id,
                      file=sys.stderr)
                continue

            if args.no_primer:
                # Start after the forward primer, stop before the reverse.
                slice_start = forward_start + len(forward_primer)
                slice_end = reverse_start
            else:
                # Keep both primers in the emitted region.
                slice_start = forward_start
                slice_end = reverse_start + len(reverse_primer)

            print(seq_id, file=out_fasta)
            print(sequence[slice_start:slice_end], file=out_fasta)
Пример #29
0
def krtd(seq,
         k,
         overlap=True,
         reverse_complement=False,
         return_full_dict=False,
         metrics=None):
    """Calculates the :math:`k`-mer return time distribution (RTD) of a sequence.

    Args:
        seq (~skbio.sequence.DNA or str): The sequence to analyze. Anything
            that is not already a ``DNA`` object is passed to ``DNA(...)``.
        k (int): The :math:`k` value to use.
        overlap (bool, optional): Whether the :math:`k`-mers should overlap.
            Defaults to True.
        reverse_complement (bool, optional): If True, each :math:`k`-mer is
            paired with its reverse complement, rather than with itself, when
            distances between occurrences are computed. Defaults to False.
        return_full_dict (bool, optional): If True, every one of the
            :math:`4^k` possible :math:`k`-mers over ``ATGC`` gets an entry,
            with an empty ``int64`` array standing in for those absent from
            the sequence. If False, only :math:`k`-mers that occur in the
            sequence are present in the result. Defaults to False.
        metrics (list, optional): If given (and non-empty), each distance
            array is replaced by the result of ``_analyze_rtd(distances,
            metrics)``.

    Warning:
        Setting ``return_full_dict=True`` enumerates all :math:`4^k`
        :math:`k`-mers, so the time taken grows exponentially as ``k``
        increases.

    Returns:
        dict: A dictionary of the shape ``{k_mer: distances}`` in which
        ``distances`` is a :obj:`~numpy.ndarray`, or the output of
        ``_analyze_rtd`` when ``metrics`` is passed (see examples below).

    Raises:
        ValueError: When the sequence contains degenerate characters.

    Examples:
        .. runblock:: pycon

            >>> from krtd import krtd # ignore
            >>> from pprint import pprint as print # for prettier printing # ignore
            >>> import numpy as np # ignore
            >>> print(krtd("ATGCACAGTTCAGA", 1))
            >>> print(krtd("ATGCACAGTTCAGA", 1, metrics=[np.mean, np.std]))
            >>> print(krtd("ATGCACAGTTCAGA", 2, reverse_complement=True))
            >>> print(krtd("ATGATTGGATATTATGAGGA", 1)) # no value for "C" is printed since it's not in the original sequence
            >>> print(krtd("ATGATTGGATATTATGAGGA", 1, return_full_dict=True)) # now it is
    """

    # Normalise the input to a DNA object so degeneracy can be checked.
    dna = seq if isinstance(seq, DNA) else DNA(seq)

    if dna.has_degenerates():
        raise ValueError("RTD for sequences with degenerates is undefined.")

    k_mer_array = seq_to_array(dna, k=k, overlap=overlap)

    # Only the k-mers that actually occur are analysed here, which keeps the
    # work proportional to the sequence rather than to 4**k.
    result = {}
    for observed in np.unique(k_mer_array):
        if reverse_complement:
            partner = DNA(observed).reverse_complement()
        else:
            partner = observed
        distances = distance_between_occurrences(
            k_mer_array,
            observed,
            partner,
            overlap=overlap,
        )
        result[observed] = (_analyze_rtd(distances, metrics)
                            if metrics else distances)

    if not return_full_dict:
        return result

    # Pad the dictionary with an empty entry for every unseen k-mer
    # (expensive: walks the whole 4**k space).
    for letters in itertools.product("ATGC", repeat=k):
        candidate = "".join(letters)
        if candidate in result:
            continue
        no_distances = np.empty(0, dtype="int64")
        result[candidate] = (_analyze_rtd(no_distances, metrics)
                             if metrics else no_distances)

    return result
def match_seqs(seq1, seq2):
    '''Determine whether two DNA sequences, possibly containing degenerate
    bases, match position by position.

    Positions where neither base is a degenerate code must be identical.
    Positions where at least one base is a degenerate code match when the
    expansions of the two bases (via ``DNA.expand_degenerates``) share at
    least one option.

    Args:
        seq1 (str): First sequence.
        seq2 (str): Second sequence.

    Returns:
        bool: True if every position is compatible, False otherwise.
        Sequences of different lengths never match.
    '''

    # A positional comparison is only meaningful for equal lengths; bail out
    # rather than indexing past the end of the shorter sequence.
    if len(seq1) != len(seq2):
        return False

    # Degenerate DNA characters.
    degenerate_codes = frozenset("RYSWKMBDHVN")

    # Single pass: plain positions are compared immediately, positions
    # involving a degenerate code are set aside so the (more expensive)
    # expansion is only done if all plain positions already agree.
    degenerate_pairs = []
    for base1, base2 in zip(seq1, seq2):
        if base1 in degenerate_codes or base2 in degenerate_codes:
            degenerate_pairs.append((base1, base2))
        elif base1 != base2:
            return False

    # A degenerate site matches when the two expansions overlap.
    for base1, base2 in degenerate_pairs:
        options1 = {str(opt) for opt in DNA(base1).expand_degenerates()}
        options2 = {str(opt) for opt in DNA(base2).expand_degenerates()}
        if options1.isdisjoint(options2):
            return False

    return True