def seq_to_array(seq, k=1, overlap=True):
    """Turn a DNA sequence into a Numpy vector of its :math:`k`-mers.

    With the default :math:`k=1` every element is a single character of the
    sequence; for :math:`k>1` every element is one :math:`k`-mer.

    Args:
        seq (~skbio.sequence.DNA or str): The sequence to convert. Anything
            that is not already a ``DNA`` instance is passed to ``DNA()``.
        k (int, optional): The :math:`k` value to use. Defaults to 1.
        overlap (bool, optional): Whether the :math:`k`-mers should overlap.
            Defaults to True.

    Returns:
        ~numpy.ndarray: An array representing the sequence.

    Examples:

        .. runblock:: pycon

            >>> from krtd import seq_to_array # ignore
            >>> seq_to_array("ATGC")
            >>> seq_to_array("ATGC", k=2)
            >>> seq_to_array("ATGC", k=2, overlap=False)

    """
    dna = seq if isinstance(seq, DNA) else DNA(seq)
    kmers = dna.iter_kmers(k=k, overlap=overlap)
    # Each element is stored as a unicode string of exactly k characters.
    return np.fromiter(map(str, kmers), "<U" + str(k))
def process_barcode_single_end_data(read1_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    bc1_len=6,
                                    rev_comp_bc1=False):
    """ Splits a single-end read into barcode and remainder, writes both

    read1_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath for the barcode record
    output_fastq1: open output fastq filepath for the trimmed read record
    bc1_len: number of leading positions treated as the barcode
    rev_comp_bc1: reverse complement the barcode (and reverse its quality
     scores) before writing.
    """
    sequence = read1_data[1]
    quality = read1_data[2]

    barcode = sequence[:bc1_len]
    barcode_quality = quality[:bc1_len]
    if rev_comp_bc1:
        barcode = str(DNA(barcode).rc())
        # Keep the quality scores aligned with the reversed barcode.
        barcode_quality = barcode_quality[::-1]

    header = read1_data[0]
    output_bc_fastq.write(
        format_fastq_record(header, barcode, barcode_quality))
    output_fastq1.write(
        format_fastq_record(header, sequence[bc1_len:], quality[bc1_len:]))

    return
def parse_illumina_line(l,
                        barcode_length,
                        rev_comp_barcode,
                        barcode_in_sequence=False):
    """Parses a single colon-delimited line of Illumina data into a dict

    l: the line; surrounding whitespace is stripped before splitting on ':'
    barcode_length: number of barcode characters to keep
    rev_comp_barcode: if True, the barcode is reverse complemented
    barcode_in_sequence: if True, the barcode is the first barcode_length
     positions of the sequence (and is trimmed from both the sequence and
     the quality string); otherwise it is read from the text that follows
     '#' in the fifth field.
    """
    parts = l.strip().split(':')
    y_field = parts[4]
    y_pieces = y_field.split('#')
    y_position = int(y_pieces[0])
    sequence = parts[5]
    qual_string = parts[6]

    if not barcode_in_sequence:
        barcode = y_pieces[1][:barcode_length]
    else:
        barcode = sequence[:barcode_length]
        sequence = sequence[barcode_length:]
        qual_string = qual_string[barcode_length:]

    if rev_comp_barcode:
        barcode = str(DNA(barcode).rc())

    return {
        'Full description': ':'.join(parts[:5]),
        'Machine Name': parts[0],
        'Channel Number': int(parts[1]),
        'Tile Number': int(parts[2]),
        'X Position': int(parts[3]),
        'Y Position': y_position,
        'Barcode': barcode,
        'Full Y Position Field': y_field,
        'Sequence': sequence,
        'Quality Score': qual_string,
    }
def rc_fasta_lines(fasta_lines, seq_desc_mapper=append_rc):
    """Yield (seq_id, sequence) pairs with reverse-complemented sequences

    fasta_lines: input handed to parse_fasta
    seq_desc_mapper: callable applied to every sequence identifier before
     it is yielded

    Sequences are upper-cased before being reverse complemented.
    """
    for original_id, original_seq in parse_fasta(fasta_lines):
        yield (seq_desc_mapper(original_id),
               str(DNA(original_seq.upper()).rc()))
def _construct(record, constructor=None, **kwargs):
    """Build a sequence object from a ``(sequence, metadata)`` record.

    ``constructor`` defaults to ``Sequence``. When ``RNA`` is requested the
    record is first built as ``DNA`` and then transcribed. Extra keyword
    arguments are forwarded to the constructor.
    """
    seq, md = record
    if constructor is None:
        constructor = Sequence

    wants_rna = constructor == RNA
    factory = DNA if wants_rna else constructor
    built = factory(seq, metadata=md, **kwargs)
    return built.transcribe() if wants_rna else built
def identity_coverage(dna_query, protein_query, dna_target, protein_target):
    """Classify how well a query matches a target.

    The DNA sequences are compared first (skipped when ``dna_query`` is the
    empty string), then the protein sequences. For each comparison a local
    alignment is computed; ``extract_sw`` supplies the identity and the
    alignment length, and coverage is the alignment length divided by the
    length of the shorter of the two sequences.

    Args:
        dna_query: query nucleotide sequence, or ``''`` to skip the DNA
            comparison.
        protein_query: query protein sequence.
        dna_target: target nucleotide sequence.
        protein_target: target protein sequence.

    Returns:
        str: ``'EXACT'`` if DNA identity and coverage are both >= 0.95,
        otherwise ``'SIMILAR'`` if protein identity and coverage are both
        >= 0.8, otherwise ``'MATCH'`` if both are >= 0.5, otherwise
        ``'NO MATCH'``.
    """
    if dna_query != '':
        try:
            sw_dna = skbio.alignment.local_pairwise_align_ssw(
                DNA(dna_query), DNA(dna_target))
        except Exception:
            # Fall back to the second aligner when the striped implementation
            # fails. ``Exception`` (rather than a bare ``except``) keeps
            # KeyboardInterrupt / SystemExit from being swallowed here.
            sw_dna = skbio.alignment.local_pairwise_align_nucleotide(
                DNA(dna_query), DNA(dna_target))
        dna_identity, align_length = extract_sw(sw_dna)
        dna_coverage = align_length / min(len(dna_query), len(dna_target))
        if dna_identity >= 0.95 and dna_coverage >= 0.95:
            return 'EXACT'

    try:
        sw_protein = skbio.alignment.local_pairwise_align_ssw(
            Protein(protein_query),
            Protein(protein_target),
            substitution_matrix=blosum62,
            gap_open_penalty=11,
            gap_extend_penalty=1)
    except Exception:
        # Same fallback strategy as for the DNA comparison above.
        sw_protein = skbio.alignment.local_pairwise_align_protein(
            Protein(protein_query),
            Protein(protein_target),
            substitution_matrix=blosum62,
            gap_open_penalty=11,
            gap_extend_penalty=1)
    protein_identity, align_length = extract_sw(sw_protein)
    protein_coverage = align_length / min(len(protein_query),
                                          len(protein_target))
    if protein_identity >= 0.8 and protein_coverage >= 0.8:
        return 'SIMILAR'
    if protein_identity >= 0.5 and protein_coverage >= 0.5:
        return 'MATCH'
    return 'NO MATCH'
def find_sgRNAs():
    '''
    Search every chromosome, on both strands, for sgRNA sites (a 20 nt
    protospacer followed by an NGG PAM), hand each hit to process_sgRNA and
    count how often each protospacer occurs.

    Positions are described as the position just before the sgRNA, along
    with the strand direction. PAMs on the reverse strand are described
    similar:
    Example:
        'tcg^acgtataaatatatcgatatNGG' would result in a tuple (3, '+')
        'atttgCCNgateagctcgatctattata^tgat' would result in a tuple (8, '-')

    Side effects: drops the existing sgRNA collection, logs progress, and
    pickles the protospacer count dictionary to 'sgRNA_dict.pkl' in DATADIR.
    The function itself returns None.
    '''
    # NOTE: pickle.load can execute arbitrary code; this file must come from
    # a trusted source.
    with open(EXON_INTERVAL_TREES_FILE, 'rb') as f:
        exon_interval_trees = pickle.load(f)

    sgRNA_count = 0
    sgRNA_dict = {}
    sgRNA_collection.drop()
    logging.info('Old sgRNA collection deleted')

    for chromosome in CHROMOSOMES:
        logging.info('find pams in {}'.format(chromosome))
        with open(CHROMOSOME_RAW_FILE.format(chromosome)) as chr_file:
            chr_sequence = DNA(chr_file.read().upper())

        for strand in ['+', '-']:
            # for the reverse strand, inversecomplement the chromosome
            if strand == '-':
                chr_sequence = chr_sequence.reverse_complement()

            # 20 Protospacer + 1 PAM-nucleotide; the lookahead lets
            # overlapping sites be found
            for guide_position in chr_sequence.find_with_regex(
                    '(?=([ACTG]{20})[ACTG]GG)'):
                process_sgRNA(guide_position, chr_sequence, chromosome,
                              strand, exon_interval_trees)
                key = kmer_to_int(chr_sequence[guide_position])
                sgRNA_dict[key] = sgRNA_dict.get(key, 0) + 1
                sgRNA_count += 1

    logging.info('Found {} sgRNA sites'.format(sgRNA_count))
    logging.info('Found {} distinct protospacers'.format(len(sgRNA_dict)))

    # pickle needs a file opened for binary *writing*; the default mode of
    # open() is read-only text, which made this dump fail.
    with open(os.path.join(DATADIR, 'sgRNA_dict.pkl'), 'wb') as f:
        pickle.dump(sgRNA_dict, f)
def global_align(seq1_1hot, seq2_1hot):
    """Globally align two 1-hot encoded sequences.

    Both inputs are decoded with ``dna_io.hot1_dna`` and aligned with
    ``global_pairwise_align_nucleotide`` using the scoring options below.

    Returns:
        tuple: ``(seq1_align, seq2_align)``, the two rows of the alignment
        as strings.
    """
    # Single source of truth for the scoring scheme; previously this dict was
    # defined but the same values were repeated as literal keyword arguments.
    align_opts = {
        'gap_open_penalty': 10,
        'gap_extend_penalty': 1,
        'match_score': 5,
        'mismatch_score': -4
    }

    seq1_dna = DNA(dna_io.hot1_dna(seq1_1hot))
    seq2_dna = DNA(dna_io.hot1_dna(seq2_1hot))

    # The first element of the aligner's result is the alignment itself.
    seq_align = global_pairwise_align_nucleotide(seq1_dna, seq2_dna,
                                                 **align_opts)[0]

    seq1_align = str(seq_align[0])
    seq2_align = str(seq_align[1])
    return seq1_align, seq2_align
def _construct(record, constructor=None, **kwargs):
    """Build a sequence object from a ``(seq, metadata, positional_metadata)``
    record.

    When no constructor is given it is chosen from the lower-cased
    ``md['ID']['unit']``: ``'bp'`` selects ``DNA`` and ``'aa'`` selects
    ``Protein``. When ``RNA`` is requested the record is built as ``DNA``
    and then transcribed. Extra keyword arguments are forwarded.
    """
    seq, md, pmd = record
    if constructor is None:
        by_unit = {'bp': DNA, 'aa': Protein}
        # An unrecognised unit leaves ``constructor`` as None.
        constructor = by_unit.get(md['ID']['unit'].lower(), constructor)

    wants_rna = constructor == RNA
    factory = DNA if wants_rna else constructor
    built = factory(seq, metadata=md, positional_metadata=pmd, **kwargs)
    return built.transcribe() if wants_rna else built
def get_primer_positions(primer_seqs, reference_seq):
    """Locate every primer on the reference sequence.

    Each primer is aligned to the reference with (striped) Smith-Waterman.
    Primers whose name contains ``'RIGHT'`` are reverse complemented first.

    Args:
        primer_seqs: mapping of primer name to primer sequence.
        reference_seq: the reference sequence the primers are aligned to.

    Returns:
        dict: maps both the start and the end position of each primer on the
        reference to the primer's name. If two primers share a position, the
        one processed last wins.
    """
    # The reference does not change between primers, so build it once.
    reference = DNA(reference_seq)

    d = {}
    for qname, qseq in primer_seqs.items():
        if 'RIGHT' in qname:
            # mind the reverse complement
            qseq = str(DNA(qseq).reverse_complement())

        # Only the positions are needed; the alignment and its score are
        # discarded.
        _, _, pos = local_pairwise_align_ssw(DNA(qseq), reference)
        # pos holds (query span, reference span); keep the reference span.
        _, (pstart, pend) = pos

        d[pstart] = qname
        d[pend] = qname

    return d
def test_multiple_sequence_alignment(self):
    """Aligning five related sequences yields the expected gapped rows.
    """
    unaligned = [
        ('seq_1', 'caccggcggcccggtggtggccattattattgggtctaaag'),
        ('seq_2', 'caccggcggcccgagtggtggccattattattgggtcaagg'),
        ('seq_3', 'caccggcggcccgagtgatggccattattattgggtctaaag'),
        ('seq_4', 'aaccggcggcccaagtggtggccattattattgggtctaaag'),
        ('seq_5', 'caccgggcccgagtggtggccattattattgggtctaaag'),
    ]
    input_seqs = [DNA(residues, id=name) for name, residues in unaligned]
    collection = SequenceCollection(input_seqs)

    # Write the input to a FASTA file in the working directory.
    fasta_path = join(self.working_dir, "seqs.fna")
    with open(fasta_path, 'w') as handle:
        handle.write(collection.to_fasta())

    observed = multiple_sequence_alignment(fasta_path)

    aligned = [
        ('seq_1', 'caccggcggcccg-gtggtggccattattattgggtctaaag'),
        ('seq_2', 'caccggcggcccgagtggtggccattattattgggtcaagg-'),
        ('seq_3', 'caccggcggcccgagtgatggccattattattgggtctaaag'),
        ('seq_4', 'aaccggcggcccaagtggtggccattattattgggtctaaag'),
        ('seq_5', 'caccg--ggcccgagtggtggccattattattgggtctaaag'),
    ]
    expected = [DNA(residues, id=name) for name, residues in aligned]

    self.assertItemsEqual(observed, expected)
def get_rev_primer_seqs(mapping_fp):
    """ Parses mapping file to get dictionary of SampleID:Rev primers

    mapping_fp: mapping filepath

    Returns a dict mapping each SampleID to the list of reverse-complemented
    sequences found in its comma-separated 'ReversePrimer' field.

    Raises ValueError for duplicate SampleIDs or invalid DNA reported by
    process_id_map, and KeyError when the 'ReversePrimer' column is missing.
    """
    hds, mapping_data, run_description, errors, warnings = process_id_map(
        mapping_fp, has_barcodes=False, disable_primer_check=True)

    if errors and any(
            message.startswith("Duplicate SampleID") for message in errors):
        raise ValueError('Errors were found with mapping file, '
                         'please run validate_mapping_file.py to '
                         'identify problems.')

    # create dict of dicts with SampleID:{each header:mapping data}
    id_map = {}
    for row in mapping_data:
        id_map[row[0]] = {}
    for column, column_name in enumerate(hds):
        for row in mapping_data:
            id_map[row[0]][column_name] = row[column]

    reverse_primers = {}
    for sample_id in id_map:
        try:
            primers = id_map[sample_id]['ReversePrimer'].split(',')
            reverse_primers[sample_id] = [
                str(DNA(primer).rc()) for primer in primers
            ]
        except KeyError:
            raise KeyError("Reverse primer not found in mapping file, "
                           "please include a 'ReversePrimer' column.")

    # Check for valid reverse primers
    # Will have been detected as warnings from mapping file
    for message in errors:
        if message.startswith("Invalid DNA sequence detected"):
            raise ValueError(
                "Problems found with reverse primers, please "
                "check mapping file with validate_mapping_file.py")

    return reverse_primers
def find_guide_context(gene, context, sense):
    '''
    Locate a guide context inside its gene on the genome.

    :gene: symbol of the gene (the first 15 characters are also compared
        against the gene_id column)
    :context: 30mer
    :sense: boolean whether in sense or antisense
    :returns: (species, chromosome, cut_position) where cut_position is
        0-base-index. cut_position is -1 when the context is not found
        inside the gene; the chromosome is 'chrNaN' when the gene itself
        is not found.
    '''
    if gene in MOUSE_GENES:
        species, df = 'mm10', read_mm10()
    else:
        species, df = 'hg38', read_hg38()

    try:
        name_or_id = (df.gene_name == gene) | (df.gene_id == gene[:15])
        gene_rows = df[name_or_id & (df.feature == 'gene')]
        gene_data = gene_rows.iloc[0].copy()
    except IndexError:
        print(f'didnot find context {context} in gene {gene}. Sense: {sense}')
        return species, 'chrNaN', -1

    gene_data.start -= 1
    absolute_sense = (sense == (gene_data.strand == '+'))
    chromosome = gene_data.seqname

    # only if guide and gene strand are not the same
    if not absolute_sense:
        context = str(DNA(context).reverse_complement())

    chr_seq = chromosomes(species)[chromosome]
    step = len(context)
    index = chr_seq.find(context)
    while index != -1:
        cut_position = cut_position_from_index(index, absolute_sense,
                                               chr_seq)
        if cut_position >= gene_data.start and cut_position < gene_data.end:
            # it's inside the gene: done
            return species, chromosome, cut_position
        # Continue searching after the end of this occurrence.
        index = chr_seq.find(context, index + step)

    print(f'didnot find context {context} in gene {gene}. Sense: {sense}')
    return species, chromosome, -1
def _construct(record, constructor=None, **kwargs):
    '''Construct the object of Sequence, DNA, RNA, or Protein.

    ``record`` is a ``(seq, metadata, interval_metadata)`` triple.
    ``lowercase=True`` is passed to the constructor unless the caller
    supplies ``lowercase`` explicitly. Without a constructor the type is
    chosen from ``md['LOCUS']['unit']`` ('bp' -> DNA, 'aa' -> Protein).
    '''
    seq, md, imd = record
    kwargs.setdefault('lowercase', True)

    if constructor is None:
        # RNA mol type has T instead of U for genbank from NCBI, so 'bp'
        # records are built as DNA.
        by_unit = {'bp': DNA, 'aa': Protein}
        # An unrecognised unit leaves ``constructor`` as None.
        constructor = by_unit.get(md['LOCUS']['unit'], constructor)

    wants_rna = constructor == RNA
    factory = DNA if wants_rna else constructor
    built = factory(seq, metadata=md, interval_metadata=imd, **kwargs)
    return built.transcribe() if wants_rna else built
def _context_guide(exon_id,
                   start,
                   guide_direction,
                   chromosome,
                   context_length=5):
    '''
    Build the 30mer context around a guide.

    :exon_id: ensembl id
    :start: bp position start of guide(!) relative to chromosome
    :guide_direction: either 'FWD' or 'RVS'
    :chromosome: the chromosome this is on
    :context_length: option to adjust padding in bps TODO: implement
        (currently unused)
    :returns: azimuth compliant context 30mers (that is 5bp+protospacer+5bp)
        in capital letters
    '''
    exon = gencode_exons().loc[exon_id]
    if isinstance(exon, pd.DataFrame):
        # Several rows share this exon id: keep those on the requested
        # chromosome and use the first one.
        exon = exon[exon.seqname == chromosome]
        if len(exon.start.unique()) != 1:
            logging.error(
                f'azimuth.py: same exon_id with different starts {exon}')
        exon = exon.iloc[0]

    is_reverse = guide_direction == 'RVS'
    window_start = start - (3 if is_reverse else 4)
    seq = chromosomes()[exon['seqname']][window_start:window_start +
                                         30].upper()

    # if the strands don't match, it needs to be reversed
    if is_reverse:
        seq = str(DNA(seq).reverse_complement())

    assert seq[25:27] == 'GG', \
        'the generated context is invalid (PAM) site. {}, {}, {}'.format(
            seq, exon['strand'], guide_direction)
    return seq
def _find_context(guide, chromosome, position):
    '''
    Look for the guide in a window of the chromosome file around position
    and cut out its 30mer context.

    :guide: guide sequence to search for
    :chromosome: inserted into CHROMOSOME_RAW_FILE to locate the file
    :position: file offset (int or numeric string) the window is centred on
    :returns: (context, label) where label is 'sense' or 'antisense';
        (None, None) when the context has no 'GG' at positions 25-26
    '''
    OFFSET = 50
    if isinstance(position, str):
        position = int(position)

    with open(CHROMOSOME_RAW_FILE.format(chromosome)) as f:
        f.seek(position - OFFSET)
        seq = f.read(2 * OFFSET + 20).upper()

    index = seq.find(guide)
    if index != -1:
        source, hit = seq, index
    else:
        # Not on the strand as stored: try the reverse complement.
        source = str(DNA(seq).reverse_complement())
        hit = source.find(guide)
        assert hit >= 0, f'guide not found.. {chromosome} {position}'
    ret = source[hit - 4:hit + 23 + 3]

    if ret[25:27] != 'GG':
        print(
            f'gg required... {chromosome} {position} {guide}. Dropping :/')
        return None, None

    # NOTE(review): a hit on the stored strand is labelled 'antisense' and a
    # reverse-complement hit 'sense' - confirm this mapping is intended.
    label = 'sense' if index == -1 else 'antisense'
    return ret, label
def hamming_dist(seq1, seq2):
    '''Computes the Hamming distance between DNA sequences.

    Sites where either sequence holds a degenerate character are scored
    separately: the site contributes the proportion of differing pairs
    among all combinations of the two characters' expansions. The return
    value is the number of differences divided by the initial length of
    seq1.
    '''

    # Degenerate DNA characters, in the order in which they are processed.
    degenerate_codes = ("R", "Y", "S", "W", "K", "M", "B", "D", "H", "V",
                        "N")

    # If no degenerate characters in seqs then return Hamming distance.
    if not any(code in seq1 + seq2 for code in degenerate_codes):
        return (DNA(seq1).distance(DNA(seq2)))

    initial_length = len(seq1)

    # Pairs (character from seq1, character from seq2) for every site that
    # is pulled out because at least one side is degenerate.
    removed_pairs = []

    for code in degenerate_codes:
        # Sites where seq1 carries the code are removed before sites where
        # seq2 carries it.
        for carrier in ("first", "second"):
            while True:
                pos = (seq1 if carrier == "first" else seq2).find(code)
                if pos == -1:
                    break
                removed_pairs.append((seq1[pos], seq2[pos]))
                seq1 = seq1[:pos] + seq1[pos + 1:]
                seq2 = seq2[:pos] + seq2[pos + 1:]

    # distance() is multiplied by the remaining length to get the number of
    # differing non-degenerate sites.
    if len(seq1) > 0:
        nondegen_diff = DNA(seq1).distance(DNA(seq2)) * len(seq1)
    else:
        nondegen_diff = 0

    # The # of differences at a degenerate site is the proportion of
    # comparisons which differed.
    degen_diff = 0
    for left_char, right_char in removed_pairs:
        left_options = list(DNA(str(left_char)).expand_degenerates())
        right_options = list(DNA(str(right_char)).expand_degenerates())

        total_compare = len(left_options) * len(right_options)
        num_diff = sum(1 for left in left_options
                       for right in right_options if left != right)

        degen_diff += num_diff / total_compare

    return ((nondegen_diff + degen_diff) / initial_length)
def main():
    """Demultiplex a gzipped FASTQ by the barcodes found in its read names.

    Command-line entry point. Reads a tab-delimited metadata file with
    "SampleID" (or "#SampleID") and "BarcodeSequence" columns, opens one
    gzipped output FASTQ per barcode plus one for unknown barcodes, then
    streams the input FASTQ and writes each four-line record to the file
    selected by ``barcode_match`` for that record's header line.

    Exits via ``sys.exit`` when the output folder exists without --force,
    when a required metadata column is missing, when no barcodes are found,
    or when barcodes have differing lengths.
    """
    parser = argparse.ArgumentParser(
        description="Demultiplex gzipped FASTQ based on barcodes present in "
        "readnames (not in sequence). The metadata file should be "
        "tab-delimited with one column named \"SampleID\" and one "
        "column named \"BarcodeSequence\". The barcodes are "
        "assumed to be at the end of the read names, before "
        "\"/1\" or \"/2\" if the reads are paired-end.",
        epilog='''Usage example:

python3 demult_barcode_readnames.py -f FASTQ -m METADATA -s data1_R1 -o \
OUTPUT_FOLDER

''',
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-f", "--fastq", metavar="FASTQ", type=str,
                        help="Path to gzipped FASTQ file", required=True)

    parser.add_argument("-m", "--meta", metavar="METADATA", type=str,
                        help="Path to sample metadata file", required=True)

    parser.add_argument("-s", "--suffix", metavar="SUFFIX", type=str,
                        help="String to append to the end of each "
                        "output filename (before fastq.gz)",
                        required=False)

    parser.add_argument("-o", "--output", metavar="OUTPUT_FOLDER", type=str,
                        help="Output folder to write FASTQs",
                        required=False, default="output_demult")

    parser.add_argument("-e", "--errors", metavar="FLOAT", type=float,
                        help="Number of errors allowed in barcode",
                        required=False, default=1.5)

    parser.add_argument("-r", "--revcomp", action="store_true",
                        help="Flag to indicate that barcodes in "
                        "sample_metadata file should be reverse "
                        "complemented before matching.",
                        required=False, default=False)

    parser.add_argument("--maxN", metavar="INT", type=int,
                        help="Max number of N characters allowed in read "
                        "barcode.",
                        required=False, default=1)

    parser.add_argument("--force", action="store_true",
                        help="Flag to indicate that command should be run "
                        "even if output folder exists",
                        required=False, default=False)

    args = parser.parse_args()

    # Check if output directory exists.
    if os.path.exists(args.output):
        if not args.force:
            sys.exit("Output directory exists and --force option not set so "
                     "stopping.")
    else:
        os.makedirs(args.output)

    # The suffix goes *before* the ".fastq.gz" extension, as promised by the
    # --suffix help text (it used to be appended after the extension).
    name_suffix = "_" + args.suffix if args.suffix else ""

    # Dict to keep track of all filehandles, keyed by barcode.
    sample_fh = {}

    try:
        # Set var to identify header of file.
        first_line = True

        # Read through sample metadata file and create filehandle for each
        # barcode.
        with open(args.meta, "rt") as meta_in:
            for meta_line in meta_in:

                # Strip off line terminator and split on tabs.
                meta_line = meta_line.rstrip("\r\n")

                # Blank lines carry no sample and would otherwise fail the
                # column lookups below.
                if not meta_line:
                    continue

                meta_line_split = meta_line.split("\t")

                # If line one then figure out which columns are SampleId and
                # BarcodeSequence
                if first_line:
                    if "SampleID" in meta_line_split:
                        sample_col = meta_line_split.index("SampleID")
                    elif "#SampleID" in meta_line_split:
                        sample_col = meta_line_split.index("#SampleID")
                    else:
                        sys.exit("No column named \"SampleID\" or "
                                 "\"#SampleID\" in metadata file")

                    if "BarcodeSequence" in meta_line_split:
                        barcode_col = meta_line_split.index(
                            "BarcodeSequence")
                    else:
                        sys.exit("No column named \"BarcodeSequence\" in "
                                 "metadata file")

                    first_line = False
                    continue

                # Otherwise identify sample and barcode combo and open
                # filehandle.
                sample = meta_line_split[sample_col]
                barcode = meta_line_split[barcode_col]

                # Take reverse complement of barcode if --revcomp set.
                if args.revcomp:
                    barcode = str(
                        DNA(barcode, validate=True,
                            lowercase=True).reverse_complement())

                outfile = os.path.join(
                    args.output, sample + name_suffix + ".fastq.gz")

                sample_fh[barcode] = gzip.open(outfile, "wt")

                print("Writing reads for sample " + sample +
                      " with barcode " + barcode + " to file " + outfile,
                      file=sys.stderr)

        # Also open output file for reads which cannot be demultiplexed.
        unknown_out = os.path.join(
            args.output, "unknown" + name_suffix + ".fastq.gz")

        sample_fh["unknown"] = gzip.open(unknown_out, "wt")

        print("Writing reads with unknown barcode to " + unknown_out,
              file=sys.stderr)

        # Check that all barcodes are the same length.
        barcode_lengths = {len(b) for b in sample_fh if b != "unknown"}

        if not barcode_lengths:
            sys.exit("Error no barcodes found in metadata file.")

        if len(barcode_lengths) > 1:
            sys.exit("Error barcodes in metadata file are of varying "
                     "lengths.")

        barcode_length = barcode_lengths.pop()

        # Initialize fastq line counter (every 4th line is header).
        fastq_lc = 4

        # Read through FASTQ and demultiplex based on barcode matches.
        with gzip.open(args.fastq, 'rt') as fastq_in:
            for fastq_line in fastq_in:

                # A new record starts: pick the output file from the header.
                if fastq_lc == 4:
                    # Check if any barcode is within args.errors of this
                    # seq's barcode.
                    last_barcode = barcode_match(fastq_line,
                                                 sample_fh.keys(),
                                                 args.errors,
                                                 barcode_length, args.maxN)
                    fastq_lc = 1
                else:
                    fastq_lc += 1

                print(fastq_line, file=sample_fh[last_barcode], end='')
    finally:
        # Close every filehandle, including when exiting early on an error.
        for fh in sample_fh.values():
            fh.close()
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_offset=None):
    """Generator that demultiplexes and quality filters single-end reads

    Walks the barcode fastq and the read fastq in lock step and yields one
    tuple (fasta_header, sequence, quality, seq_id) per read that is
    assigned to a sample and passes the quality filter.

    fastq_read_f: reads, as a file-like object (readline/seek) or an
     indexable sequence of lines
    fastq_barcode_f: barcode reads, parsed in parallel with fastq_read_f
    barcode_to_sample_id: dict mapping barcode to sample id
    store_unassigned: if True, reads without a sample id are kept under the
     sample id 'Unassigned' instead of being skipped
    max_bad_run_length, phred_quality_threshold, seq_max_N,
     filter_bad_illumina_qual_digit: passed through to
     quality_filter_sequence
    min_per_read_length_fraction: multiplied by the length of the second
     line of fastq_read_f to obtain the minimum length passed to
     quality_filter_sequence
    rev_comp: reverse complement the read (and reverse its quality) before
     yielding
    rev_comp_barcode: reverse complement the barcode before lookup
    start_seq_id: first sequence id; incremented after every yielded read
    log_f: if not None, a summary log string is written to it at the end
    histogram_f: if not None and at least one read was kept, a sequence
     length histogram is written to it at the end
    barcode_correction_fn: passed through to correct_barcode
    max_barcode_errors: reads whose barcode has more errors are skipped
    strict_header_match: if True, a header mismatch between barcode and
     read raises FastqParseError
    phred_offset: 33 or 64; if None it is derived from the first line of
     fastq_read_f via is_casava_v180_or_later

    Raises ValueError for a PHRED offset other than 33 or 64 and for an
     unknown quality filter result.
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2

    seq_id = start_seq_id
    # grab the first lines and then seek back to the beginning of the file
    try:
        fastq_read_f_line1 = fastq_read_f.readline()
        fastq_read_f_line2 = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        # not file-like: fall back to indexing the first two lines
        fastq_read_f_line1 = fastq_read_f[0]
        fastq_read_f_line2 = fastq_read_f[1]

    if phred_offset is None:
        post_casava_v180 = is_casava_v180_or_later(fastq_read_f_line1)
        if post_casava_v180:
            phred_offset = 33
        else:
            phred_offset = 64

    # the header comparison function is selected by PHRED offset
    if phred_offset == 33:
        check_header_match_f = check_header_match_180_or_later
    elif phred_offset == 64:
        check_header_match_f = check_header_match_pre180
    else:
        raise ValueError("Invalid PHRED offset: %d" % phred_offset)

    # compute the barcode length, if they are all the same.
    # this is useful for selecting a subset of the barcode read
    # if it's too long (e.g., for technical reasons on the sequencer)
    barcode_lengths = set(
        [len(bc) for bc, sid in barcode_to_sample_id.items()])
    if len(barcode_lengths) == 1:
        barcode_length = barcode_lengths.pop()
    else:
        barcode_length = None

    # compute the minimum read length as a fraction of the length of the input
    # read
    min_per_read_length = min_per_read_length_fraction * \
        len(fastq_read_f_line2)

    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}
    for bc_data, read_data in izip(
            parse_fastq(fastq_barcode_f, strict=False,
                        phred_offset=phred_offset),
            parse_fastq(fastq_read_f, strict=False,
                        phred_offset=phred_offset)):
        input_sequence_count += 1
        # Confirm match between barcode and read headers
        if strict_header_match and \
                (not check_header_match_f(bc_data[header_index],
                                          read_data[header_index])):
            raise FastqParseError(
                "Headers of barcode and read do not match. Can't continue. "
                "Confirm that the barcode fastq and read fastq that you are "
                "passing match one another.")
        else:
            header = read_data[header_index]

        # Grab the barcode sequence
        if barcode_length:
            # because thirteen cycles are sometimes used for
            # technical reasons, this step looks only at the
            # first twelve bases. note that the barcode is
            # rev-comp'ed after this step if requested since
            # the thirteenth base is a technical artefact, not
            # barcode sequence.
            barcode = bc_data[sequence_index][:barcode_length]
        else:
            barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = str(DNA(barcode).rc())
        # Grab the read sequence
        sequence = read_data[1]
        # Grab the read quality
        quality = read_data[2]

        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
            correct_barcode(
                barcode, barcode_to_sample_id, barcode_correction_fn)
        # skip samples with too many errors
        if (num_barcode_errors > max_barcode_errors):
            count_barcode_errors_exceed_max += 1
            continue

        # skip unassignable samples unless otherwise requested
        if sample_id is None:
            if not store_unassigned:
                count_barcode_not_in_map += 1
                continue
            else:
                sample_id = 'Unassigned'

        # the filter also returns the (possibly modified) sequence and quality
        quality_filter_result, sequence, quality =\
            quality_filter_sequence(header,
                                    sequence,
                                    quality,
                                    max_bad_run_length,
                                    phred_quality_threshold,
                                    min_per_read_length,
                                    seq_max_N,
                                    filter_bad_illumina_qual_digit)

        # process quality result
        if quality_filter_result != 0:
            # if the quality filter didn't pass record why and
            # move on to the next record
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError("Unknown quality filter result: %d" %
                                 quality_filter_result)
            continue

        sequence_lengths.append(len(sequence))

        # count the reads kept per sample
        try:
            seqs_per_sample_counts[sample_id] += 1
        except KeyError:
            seqs_per_sample_counts[sample_id] = 1

        if rev_comp:
            sequence = str(DNA(sequence).rc())
            # keep quality scores aligned with the reversed sequence
            quality = quality[::-1]

        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\
            (sample_id, seq_id, header, barcode,
             corrected_barcode, num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1

    # Add sample IDs with zero counts to dictionary for logging
    for curr_sample_id in barcode_to_sample_id.values():
        if curr_sample_id not in seqs_per_sample_counts.keys():
            seqs_per_sample_counts[curr_sample_id] = 0

    # the log and histogram are only written once the input is exhausted
    if log_f is not None:
        log_str = format_split_libraries_fastq_log(
            count_barcode_not_in_map,
            count_too_short,
            count_too_many_N,
            count_bad_illumina_qual_digit,
            count_barcode_errors_exceed_max,
            input_sequence_count,
            sequence_lengths,
            seqs_per_sample_counts)
        log_f.write(log_str)

    if len(sequence_lengths) and histogram_f is not None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_str = format_histogram_one_count(counts, bin_edges)
        histogram_f.write(histogram_str)
        histogram_f.write('\n--\n\n')
def get_primers(header, mapping_data):
    """ Returns lists of forward/reverse primer regular expression generators

    header:  list of strings of header data.
    mapping_data:  list of lists of mapping data

    Primer fields may hold several comma-separated primers; each one is
    upper-cased and stripped. IUPAC degenerate symbols are expanded into
    regular expression character classes.

    The forward list contains the forward primers plus the reverse
    complements of the reverse primers; the reverse list contains the
    reverse primers plus the reverse complements of the forward primers.

    Will raise IndexError if either the LinkerPrimerSequence or ReversePrimer
    fields are not present, and ValueError if no primers are found.
    """

    if "LinkerPrimerSequence" in header:
        primer_ix = header.index("LinkerPrimerSequence")
    else:
        raise IndexError(
            ("Mapping file is missing LinkerPrimerSequence field."))
    if "ReversePrimer" in header:
        rev_primer_ix = header.index("ReversePrimer")
    else:
        raise IndexError(("Mapping file is missing ReversePrimer field."))

    iupac = {
        'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C',
        'R': '[AG]', 'Y': '[CT]', 'S': '[GC]', 'W': '[AT]',
        'K': '[GT]', 'M': '[AC]', 'B': '[CGT]', 'D': '[AGT]',
        'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]'
    }

    raw_forward_primers = set()
    raw_reverse_primers = set()

    for line in mapping_data:
        # Split on commas to handle pool of primers
        raw_forward_primers.update(
            primer.upper().strip() for primer in line[primer_ix].split(','))
        raw_reverse_primers.update(
            primer.upper().strip()
            for primer in line[rev_primer_ix].split(','))

    if not raw_forward_primers:
        raise ValueError(("No forward primers detected in mapping file."))
    if not raw_reverse_primers:
        raise ValueError(("No reverse primers detected in mapping file."))

    # Reverse complement each distinct primer once, after all lines have been
    # read, instead of redoing the whole accumulated set for every line.
    raw_forward_rc_primers = set(
        str(DNA(primer).rc()) for primer in raw_forward_primers)
    raw_reverse_rc_primers = set(
        str(DNA(primer).rc()) for primer in raw_reverse_primers)

    # Finding the forward primers, or rc of reverse primers indicates forward
    # read. Finding the reverse primer, or rc of the forward primers, indicates
    # the reverse read, so these sets are merged.
    raw_forward_primers.update(raw_reverse_rc_primers)
    raw_reverse_primers.update(raw_forward_rc_primers)

    # NOTE(review): ``compile`` is expected to be the regular expression
    # compiler imported at module level (e.g. ``from re import compile``),
    # not the builtin - confirm.
    forward_primers = [
        compile(''.join([iupac[symbol] for symbol in curr_primer]))
        for curr_primer in raw_forward_primers
    ]
    reverse_primers = [
        compile(''.join([iupac[symbol] for symbol in curr_primer]))
        for curr_primer in raw_reverse_primers
    ]

    return forward_primers, reverse_primers
def process_barcode_in_label(read1_data,
                             read2_data,
                             output_bc_fastq,
                             bc1_len=6,
                             bc2_len=6,
                             rev_comp_bc1=False,
                             rev_comp_bc2=False,
                             char_delineator=":"):
    """ Reads data from one or two fastq labels, writes output barcodes file.

    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores, False if no read 2.
    output_bc_fastq: open output fastq filepath
    bc1_len: number of barcode characters to take from the read 1 label
    bc2_len: number of barcode characters to take from the read 2 label
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.

    The barcode is the text after the last char_delineator in the label.
    Raises IndexError if the read 1 label does not contain char_delineator
    and ValueError if both barcodes come out empty.
    """
    header_index = 0

    read1_header = read1_data[header_index]

    # str.split never fails when the separator is absent (it returns the whole
    # label), so the missing delineator has to be detected explicitly;
    # otherwise the start of the label would be written out as the barcode.
    if char_delineator not in read1_header:
        raise IndexError("Found sequence lacking character delineator. "
                         "Sequence header %s, character delineator %s" %
                         (read1_header, char_delineator))

    bc1_read = read1_header.split(char_delineator)[-1][0:bc1_len]

    # Create fake quality scores, using 6 here to match the existing qual fake
    # qual scores that were all F.
    bc1_qual = np.ones(len(bc1_read), dtype=np.int8) * 6
    if rev_comp_bc1:
        bc1_read = str(DNA(bc1_read).rc())

    if read2_data:
        bc2_read = read2_data[header_index].strip().split(
            char_delineator)[-1][0:bc2_len]
        bc2_qual = np.ones(len(bc2_read), dtype=np.int8) * 6
        if rev_comp_bc2:
            bc2_read = str(DNA(bc2_read).rc())
    else:
        bc2_read = ""
        bc2_qual = np.array([], dtype=np.int8)

    if not bc1_read and not bc2_read:
        raise ValueError("Came up with empty barcode sequence, please check "
                         "character delineator with -s, and fastq label "
                         "%s" % read1_header)

    bc_lines = format_fastq_record(read1_header, bc1_read + bc2_read,
                                   np.hstack([bc1_qual, bc2_qual]))

    output_bc_fastq.write(bc_lines)

    return
def process_barcode_paired_stitched(read_data,
                                    output_bc_fastq,
                                    output_fastq,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq_out_not_oriented=None,
                                    switch_bc_order=False):
    """ Processes stitched barcoded reads, writes barcode, parsed stitched read

    read_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of read1 stitched data
    bc2_len: length of barcode to remove from end of read2 stitched data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads
        according to the forward primers in the mapping file. If primer is
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq_out_not_oriented: Open filepath to write reads where primers
        can't be found when attempt_read_orientation is True.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the the parameter chosen for the fastq files.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    read_seq = read_data[sequence_index]
    read_qual = read_data[quality_index]

    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        # A primer list left at its None default is treated as empty.
        for curr_primer in forward_primers or ():
            if curr_primer.search(read_data[sequence_index]):
                found_primer_match = True
                break
        if not found_primer_match:
            for curr_primer in reverse_primers or ():
                if curr_primer.search(read_data[sequence_index]):
                    # Reverse primer seen: flip the read and keep the
                    # quality scores aligned with it.
                    read_seq = str(DNA(read_seq).rc())
                    read_qual = read_qual[::-1]
                    found_primer_match = True
                    break

    if not found_primer_match and attempt_read_orientation:
        output_bc = output_bc_not_oriented
        output_read = fastq_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read = output_fastq

    bc_read1 = read_seq[0:bc1_len]
    bc_qual1 = read_qual[0:bc1_len]

    # Negative indexing cannot express "zero characters from the end":
    # x[-0:] is the whole sequence and x[n:-0] is empty. Handle bc2_len == 0
    # explicitly so that no barcode 2 leaves the read untrimmed at its end.
    if bc2_len:
        bc_read2 = read_seq[-bc2_len:]
        bc_qual2 = read_qual[-bc2_len:]
        insert_end = -bc2_len
    else:
        bc_read2 = read_seq[:0]
        bc_qual2 = read_qual[:0]
        insert_end = None

    if rev_comp_bc1:
        bc_read1 = str(DNA(bc_read1).rc())
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = str(DNA(bc_read2).rc())
        bc_qual2 = bc_qual2[::-1]

    if switch_bc_order:
        bc_read1, bc_read2 = bc_read2, bc_read1
        bc_qual1, bc_qual2 = bc_qual2, bc_qual1

    bc_lines = format_fastq_record(read_data[header_index],
                                   bc_read1 + bc_read2,
                                   np.hstack([bc_qual1, bc_qual2]))
    output_bc.write(bc_lines)
    seq_lines = format_fastq_record(read_data[header_index],
                                    read_seq[bc1_len:insert_end],
                                    read_qual[bc1_len:insert_end])
    output_read.write(seq_lines)

    return
def process_barcode_paired_end_data(read1_data,
                                    read2_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    output_fastq2,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq1_out_not_oriented=None,
                                    fastq2_out_not_oriented=None):
    """ Processes, writes paired-end barcode data, parsed sequences

    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads 1 filepath
    output_fastq2: open output fastq reads 2 filepath
    bc1_len: length of barcode 1. Taken from the beginning of read 1, or,
     when both a forward and a reverse primer were located, from the bases
     immediately preceding the forward primer.
    bc2_len: length of barcode 2. Taken from the beginning of read 2, or,
     when both primers were located, from the bases immediately preceding
     the reverse primer.
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads
     according to the forward primers in the mapping file. If primer is
     detected in current orientation, leave the read as is, but if reverse
     complement is detected (or ReversePrimer is detected in the current
     orientation) the read will either be written to the forward (read 1) or
     reverse (read 2) reads for the case of paired files, or the read will be
     reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers.
     None is treated as an empty list.
    reverse_primers: list of regular expression generators, reverse primers.
     None is treated as an empty list.
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq1_out_not_oriented: Open filepath to write reads 1 where primers
     can't be found when attempt_read_orientation is True.
    fastq2_out_not_oriented: Open filepath to write reads 2 where primers
     can't be found when attempt_read_orientation is True.
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2

    # Offsets at which the forward / reverse primer starts in its read;
    # None means the primer was not located.
    bc1_end = None
    bc2_end = None

    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        # First check forward primers
        for curr_primer in forward_primers or ():
            primer_hit = curr_primer.search(read1_data[sequence_index])
            if primer_hit:
                read1 = read1_data
                read2 = read2_data
                found_primer_match = True
                bc1_end = primer_hit.start()
                break
            primer_hit = curr_primer.search(read2_data[sequence_index])
            if primer_hit:
                read1 = read2_data
                read2 = read1_data
                found_primer_match = True
                bc1_end = primer_hit.start()
                break
        # Reverse primers are only searched once a forward primer was found
        if found_primer_match:
            for curr_primer in reverse_primers or ():
                primer_hit = curr_primer.search(read1_data[sequence_index])
                if primer_hit:
                    read1 = read2_data
                    read2 = read1_data
                    found_primer_match = True
                    bc2_end = primer_hit.start()
                    break
                primer_hit = curr_primer.search(read2_data[sequence_index])
                if primer_hit:
                    read1 = read1_data
                    read2 = read2_data
                    found_primer_match = True
                    bc2_end = primer_hit.start()
                    break
                # This primer hit neither read. A later primer that does hit
                # sets the flag back to True before breaking, so the flag
                # ends up True only if some reverse primer was located.
                found_primer_match = False
    else:
        read1 = read1_data
        read2 = read2_data

    if not found_primer_match and attempt_read_orientation:
        read1 = read1_data
        read2 = read2_data
        output_bc = output_bc_not_oriented
        output_read1 = fastq1_out_not_oriented
        output_read2 = fastq2_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read1 = output_fastq1
        output_read2 = output_fastq2

    # Compare against None rather than truthiness: a primer starting at
    # offset 0 is a legitimate hit.
    if bc1_end is not None and bc2_end is not None:
        # Clamp the start so a primer closer to the read start than the
        # barcode length cannot produce a negative (wrapping) slice index.
        bc1_start = max(bc1_end - bc1_len, 0)
        bc2_start = max(bc2_end - bc2_len, 0)
        bc_read1 = read1[sequence_index][bc1_start:bc1_end]
        bc_read2 = read2[sequence_index][bc2_start:bc2_end]
        bc_qual1 = read1[quality_index][bc1_start:bc1_end]
        bc_qual2 = read2[quality_index][bc2_start:bc2_end]
    else:
        bc_read1 = read1[sequence_index][0:bc1_len]
        bc_read2 = read2[sequence_index][0:bc2_len]
        bc_qual1 = read1[quality_index][0:bc1_len]
        bc_qual2 = read2[quality_index][0:bc2_len]

    if rev_comp_bc1:
        bc_read1 = str(DNA(bc_read1).rc())
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = str(DNA(bc_read2).rc())
        bc_qual2 = bc_qual2[::-1]

    bc_lines = format_fastq_record(read1[header_index],
                                   bc_read1 + bc_read2,
                                   np.hstack([bc_qual1, bc_qual2]))
    output_bc.write(bc_lines)

    if found_primer_match and attempt_read_orientation:
        # Oriented reads are trimmed at the primer start. An offset that is
        # still None (no reverse primers supplied) leaves the read untrimmed.
        read1_trim = bc1_end
        read2_trim = bc2_end
    else:
        read1_trim = bc1_len
        read2_trim = bc2_len

    seq1_lines = format_fastq_record(read1[header_index],
                                     read1[sequence_index][read1_trim:],
                                     read1[quality_index][read1_trim:])
    output_read1.write(seq1_lines)
    seq2_lines = format_fastq_record(read2[header_index],
                                     read2[sequence_index][read2_trim:],
                                     read2[quality_index][read2_trim:])
    output_read2.write(seq2_lines)

    return
# Scratch script: runs one global nucleotide alignment between two literal
# sequences and then prints the installed scikit-bio version string.
# NOTE(review): `skbio.alignment._pairwise` is an underscore-prefixed module;
# presumably the same function is exported from `skbio.alignment` - confirm.
from skbio.alignment._pairwise import global_pairwise_align_nucleotide
from skbio.sequence import DNA

# The alignment result is not stored or printed.
# NOTE(review): the four positional numbers are presumably the gap-open,
# gap-extend, match and mismatch scores - confirm against the signature.
global_pairwise_align_nucleotide(DNA("GCAAAAGCTGGTATTAAAGT"),
                                 DNA("GCATATTACGTGGTGATTCAAGAGGCCTTCG"),
                                 5, 1, 5, -2,
                                 penalize_terminal_gaps=True)

from skbio import __version__ as v

print(v)
def create_primer_regex_patterns(self, header, mapping_data):
    """ Returns lists of forward/reverse primer regular expression

    header: list of strings of header data.
    mapping_data: list of lists of mapping data

    Primer fields may hold a comma-separated pool of primers. Each primer is
    stripped of surrounding whitespace and upper-cased; empty entries are
    ignored. Reverse primers are additionally validated by constructing a
    DNA object and are used as given (they are not reverse complemented
    here). Only the first self.search_length symbols of each primer are
    translated, through self.iupac, into a compiled regular expression.

    Returns a 4-tuple of lists of compiled patterns:
    (forward_primers, forward_primers_rc, reverse_primers,
    reverse_primers_rc), where the "_rc" lists are built from
    self.reverse_complement() of the truncated primer.

    Will raise IndexError if either the LinkerPrimerSequence or ReversePrimer
    fields are not present, and ValueError if no forward or no reverse
    primer is found in mapping_data.
    """
    import logging
    import re

    self.logger = logging.getLogger('_getprm_')

    if "LinkerPrimerSequence" in header:
        primer_ix = header.index("LinkerPrimerSequence")
    else:
        raise IndexError(
            ("Mapping file is missing LinkerPrimerSequence field."))
    if "ReversePrimer" in header:
        rev_primer_ix = header.index("ReversePrimer")
    else:
        raise IndexError(("Mapping file is missing ReversePrimer field."))

    raw_forward_primers = set()
    raw_reverse_primers = set()

    for line in mapping_data:
        # Split on commas to handle pool of primers
        for primer in line[primer_ix].split(','):
            primer = primer.strip().upper()
            # "".split(',') yields [""]; skipping blanks keeps the
            # "no primers detected" checks below meaningful and avoids
            # compiling an empty pattern that would match every read.
            if primer:
                raw_forward_primers.add(primer)
        for primer in line[rev_primer_ix].split(','):
            primer = primer.strip().upper()
            if primer:
                # DNA() validates the characters of the primer.
                raw_reverse_primers.add(str(DNA(primer)))

    if not raw_forward_primers:
        self.logger.critical(
            "No forward primers detected in mapping file.")
        raise ValueError("No forward primers detected in mapping file.")
    if not raw_reverse_primers:
        self.logger.critical(
            "No reverse primers detected in mapping file.")
        raise ValueError("No reverse primers detected in mapping file.")

    def _to_regex(symbols):
        # Translate each primer symbol through the IUPAC table and compile.
        return re.compile(''.join([self.iupac[symbol] for symbol in symbols]))

    forward_primers = []
    forward_primers_rc = []
    reverse_primers = []
    reverse_primers_rc = []

    for curr_primer in raw_forward_primers:
        search_region = curr_primer[:self.search_length]
        forward_primers.append(_to_regex(search_region))
        forward_primers_rc.append(
            _to_regex(self.reverse_complement(search_region)))
    for curr_primer in raw_reverse_primers:
        search_region = curr_primer[:self.search_length]
        reverse_primers.append(_to_regex(search_region))
        reverse_primers_rc.append(
            _to_regex(self.reverse_complement(search_region)))

    return (forward_primers, forward_primers_rc, reverse_primers,
            reverse_primers_rc)
from skbio.sequence import DNA

# Input files, resolved relative to the current working directory.
CS_FILE = 'q2.data'
GENOME_FILE = 'genome.data'

control_sequence_list = []
genome_list = []

# Each file is read as a list of raw lines (line terminators included).
with open(CS_FILE, 'r') as f:
    control_sequence_list = f.readlines()
with open(GENOME_FILE, 'r') as f:
    genome_list = f.readlines()

# Strip only the line terminator. Slicing with [:-1] would drop a real base
# when the first line has no trailing newline and would leave a stray '\r'
# on files with CRLF line endings.
a = DNA(control_sequence_list[0].rstrip('\r\n'))
print(a)
def main():
    """Command-line entry point: process fastq read files into seqs.fna.

    Parses the command line via parse_command_line_parameters(**script_info),
    validates the option combination (reporting problems through
    option_parser.error), then for every sequence read file builds a
    sequence generator (with or without a barcode read file) and writes the
    records it yields to <output_dir>/seqs.fna. Depending on the options, a
    seqs.qual and/or seqs.fastq file is written as well. A log
    (split_library_log.txt) and histograms.txt are always created.

    Output files are written under a temporary ".incomplete" name and renamed
    once all inputs have been processed. Every file handle opened here is
    closed before returning.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    read_arguments_from_file = opts.read_arguments_from_file

    # these arguments can optionally be read from a file, reasoning is to
    # allow arguments that would span over hundreds of samples and would be
    # prohibitive to execute as a command line call
    if read_arguments_from_file:
        # sample_ids is the only one of these arguments that's returned as a
        # string, the rest of them are lists
        if opts.sample_ids:
            opts.sample_ids = ','.join(parse_items(opts.sample_ids))
        if opts.sequence_read_fps:
            opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0])
        if opts.barcode_read_fps:
            opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0])
        if opts.mapping_fps:
            opts.mapping_fps = parse_items(opts.mapping_fps[0])

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_offset = int(phred_offset)
        except ValueError:
            # shouldn't be able to get here...
            option_parser.error(
                "If --phred_offset is provided, it must be a valid integer.")

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 < min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be greater '
                            'than 0 and less than or equal to 1. You passed '
                            '%1.5f.' % min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    # A single mapping file is reused for every sequence read file.
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(
            set([
                len(sequence_read_fps),
                len(barcode_read_fps),
                len(mapping_fps)
            ])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:

        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')
        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write(format_fastq_record(h, s, q))
    else:

        def fastq_writer(h, s, q):
            pass

    def file_md5(fp):
        # Hash the file and release the handle right away instead of
        # leaving an anonymous open() for the garbage collector.
        with open(fp) as md5_f:
            return safe_md5(md5_f).hexdigest()

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    # The three lists were checked above to have the same length.
    input_fps = zip(sequence_read_fps, barcode_read_fps, mapping_fps)
    for i, (sequence_read_fp, barcode_read_fp, mapping_fp) in \
            enumerate(input_fps):
        barcode_read_f = None

        if mapping_fp is not None:
            mapping_f = open(mapping_fp, 'U')
            _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                mapping_f,
                disable_primer_check=True,
                has_barcodes=barcode_read_fp is not None)
        else:
            mapping_f = None
            barcode_to_sample_id = {}

        if rev_comp_mapping_barcodes:
            # .items() rather than the Python-2-only .iteritems()
            barcode_to_sample_id = {
                str(DNA(k).rc()): v
                for k, v in barcode_to_sample_id.items()
            }

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay "
                    "codes. Do they need to be reverse complemented? If these "
                    "are not golay barcodes pass --barcode_type 12 to disable "
                    "barcode error correction, or pass --barcode_type # if "
                    "the barcodes are not 12 base pairs, where # is the size "
                    "of the barcodes. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, file_md5(mapping_fp)))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp, file_md5(sequence_read_fp)))

        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp, 'U')

        # Initialised here so that start_seq_id can still be advanced below
        # when the generator yields no records.
        seq_id = start_seq_id

        if barcode_read_fp is not None:
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                        (barcode_read_fp, file_md5(barcode_read_fp)))

            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp, 'U')

            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f,
                barcode_read_f,
                barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_offset=phred_offset)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f,
                sample_ids[i],
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_offset=phred_offset)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

        # The generator is exhausted, so this iteration's inputs can go.
        for input_f in (sequence_read_f, barcode_read_f, mapping_f):
            if input_f is not None:
                input_f.close()

    log_f.close()
    histogram_f.close()

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
def main():
    """Slice the primer-bounded region out of every sequence in a FASTA file.

    Parses the command line, reads the input FASTA with read_fasta(), and
    for each sequence locates the forward primer and the reverse complement
    of the reverse primer with seq_match_start(). Sequences where either
    primer is missing, matches more than once, or where the forward primer
    starts after the reverse primer are skipped with a message on stderr.
    For the remaining sequences the identifier and the sliced region are
    written to the output file, one per line. With --no_primer the slice
    excludes the primers themselves.
    """
    parser = argparse.ArgumentParser(
        description="Slice out amplified region of gene based on forward "
        "and reverse primers, which can contain degenerate bases.",
        epilog='''Usage example:

python3 slice_amplified_region.py -i FASTA -f ACGCGHNRAACCTTACC -r ACGGGCRGTGWGTRCAA -o OUT_FASTA

''',
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-i", "--input", metavar="IN_FASTA", type=str,
                        help="Path to input FASTA", required=True)

    parser.add_argument("-o", "--output", metavar="OUT_FASTA", type=str,
                        help="Path to output FASTA", required=True)

    parser.add_argument("-f", "--forward", metavar="FORWARD_PRIMER",
                        type=str, help="Forward primer sequence.",
                        required=True)

    parser.add_argument("-r", "--reverse", metavar="REVERSE_PRIMER",
                        type=str, help="Reverse primer sequence.",
                        required=True)

    # The two help fragments are concatenated, so the first one needs its
    # trailing space ("removed in", not "removedin").
    parser.add_argument("--no_primer", action="store_true",
                        help="Flag to indicate that primers should be removed "
                        "in output sequences.", required=False)

    args = parser.parse_args()

    input_fasta = read_fasta(args.input)

    forward_primer = args.forward
    # The reverse primer is searched for as its reverse complement.
    reverse_primer = str(DNA(args.reverse).reverse_complement())

    # The context manager closes the output even if a sequence raises.
    with open(args.output, "w") as out_fasta:
        for seq_id, sequence in input_fasta.items():

            # Figure out where forward and reverse primers match.
            forward_start = seq_match_start(sequence, forward_primer)
            reverse_start = seq_match_start(sequence, reverse_primer)

            if forward_start == "multiple":
                print("Skipping", seq_id,
                      "due to multiple matches of forward primer",
                      sep=" ", file=sys.stderr)
                continue
            if reverse_start == "multiple":
                print("Skipping", seq_id,
                      "due to multiple matches of reverse primer",
                      sep=" ", file=sys.stderr)
                continue
            if forward_start is None:
                print("Forward primer not found in", seq_id,
                      sep=" ", file=sys.stderr)
                continue
            if reverse_start is None:
                print("Reverse primer not found in", seq_id,
                      sep=" ", file=sys.stderr)
                continue
            if forward_start > reverse_start:
                print("Forward primer matches after reverse in", seq_id,
                      sep=" ", file=sys.stderr)
                continue

            if args.no_primer:
                slice_start = forward_start + len(forward_primer)
                slice_end = reverse_start
            else:
                slice_start = forward_start
                slice_end = reverse_start + len(reverse_primer)

            print(seq_id, file=out_fasta)
            print(sequence[slice_start:slice_end], file=out_fasta)
def krtd(seq,
         k,
         overlap=True,
         reverse_complement=False,
         return_full_dict=False,
         metrics=None):
    """Computes the :math:`k`-mer return time distribution (RTD) of a sequence.

    Args:
        seq (~skbio.sequence.DNA or str): The sequence to analyze. Strings
            are converted to :obj:`~skbio.sequence.DNA` first.
        k (int): The :math:`k` value to use.
        overlap (bool, optional): Whether the :math:`k`-mers should overlap.
            Defaults to True.
        reverse_complement (bool, optional): If True, distances are measured
            from each :math:`k`-mer to its reverse complement instead of to
            its own next occurrence. Defaults to False.
        return_full_dict (bool, optional): If True, every possible
            :math:`k`-mer over ``ATGC`` gets an entry, with an empty
            ``int64`` array (or the metrics of one) for :math:`k`-mers that
            do not occur in the sequence. If False, only :math:`k`-mers
            present in the sequence appear as keys. Defaults to False.
        metrics (list): A list of functions which, if passed, will be applied
            to each RTD array.

    Warning:
        With ``return_full_dict=True`` all :math:`4^k` possible
        :math:`k`-mers are enumerated, so the running time grows
        exponentially with ``k``.

    Returns:
        dict: A dictionary of the shape ``{k_mer: distances}`` in which
        ``distances`` is whatever ``distance_between_occurrences`` returned
        for that :math:`k`-mer. If ``metrics`` is passed, each value is
        instead the result of applying the metrics to those distances (see
        examples below).

    Raises:
        ValueError: When the sequence is degenerate.

    Examples:
        .. runblock:: pycon

            >>> from krtd import krtd # ignore
            >>> from pprint import pprint as print # for prettier printing # ignore
            >>> import numpy as np # ignore
            >>> print(krtd("ATGCACAGTTCAGA", 1))
            >>> print(krtd("ATGCACAGTTCAGA", 1, metrics=[np.mean, np.std]))
            >>> print(krtd("ATGCACAGTTCAGA", 2, reverse_complement=True))
            >>> print(krtd("ATGATTGGATATTATGAGGA", 1)) # no value for "C" is printed since it's not in the original sequence
            >>> print(krtd("ATGATTGGATATTATGAGGA", 1, return_full_dict=True)) # now it is
    """
    # Normalise the input to a DNA object before validating it.
    dna = seq if isinstance(seq, DNA) else DNA(seq)
    if dna.has_degenerates():
        raise ValueError("RTD for sequences with degenerates is undefined.")

    k_mer_array = seq_to_array(dna, k=k, overlap=overlap)

    rtds = {}

    # Only k-mers that actually occur are measured here.
    for observed in np.unique(k_mer_array):
        if reverse_complement:
            partner = DNA(observed).reverse_complement()
        else:
            partner = observed
        distances = distance_between_occurrences(
            k_mer_array,
            observed,
            partner,
            overlap=overlap,
        )
        rtds[observed] = (_analyze_rtd(distances, metrics)
                          if metrics else distances)

    if not return_full_dict:
        return rtds

    # Add an entry for every k-mer that never occurred (expensive for big k).
    for letters in itertools.product("ATGC", repeat=k):
        candidate = "".join(letters)
        if candidate in rtds:
            continue
        no_distances = np.empty(0, dtype="int64")
        rtds[candidate] = (_analyze_rtd(no_distances, metrics)
                           if metrics else no_distances)

    return rtds
# Degenerate IUPAC DNA codes mapped to the set of bases each can stand for.
# Any character not listed here only matches itself.
_IUPAC_DEGENERATE_BASES = {
    "R": frozenset("AG"),
    "Y": frozenset("CT"),
    "S": frozenset("CG"),
    "W": frozenset("AT"),
    "K": frozenset("GT"),
    "M": frozenset("AC"),
    "B": frozenset("CGT"),
    "D": frozenset("AGT"),
    "H": frozenset("ACT"),
    "V": frozenset("ACG"),
    "N": frozenset("ACGT"),
}


def match_seqs(seq1, seq2):
    '''Determine whether two sequences of the same length and possibly
    containing degenerate bases match.

    Two positions match when the characters are identical, or when the sets
    of bases they can stand for (after expanding upper-case IUPAC degenerate
    codes) share at least one base. Sequences of different lengths never
    match.

    seq1: first sequence, as a string.
    seq2: second sequence, as a string.

    Returns True if every position matches, otherwise False.'''

    if len(seq1) != len(seq2):
        return False

    # Single pass over aligned positions: no repeated string slicing and no
    # per-site sequence objects.
    for base1, base2 in zip(seq1, seq2):
        if base1 == base2:
            continue
        options1 = _IUPAC_DEGENERATE_BASES.get(base1, frozenset((base1,)))
        options2 = _IUPAC_DEGENERATE_BASES.get(base2, frozenset((base2,)))
        if options1.isdisjoint(options2):
            return False

    return True