from wub.util import seq as seq_util

# Parse command line arguments:
parser = argparse.ArgumentParser(
    description='Reverse (but not complement!) sequences and qualities in fastq file.')
parser.add_argument(
    'input_fastq',
    type=argparse.FileType('r'),
    default=sys.stdin,
    nargs='?',
    help='Input fastq (default: stdin).')
parser.add_argument(
    'output_fastq',
    type=argparse.FileType('w'),
    default=sys.stdout,
    nargs='?',
    help='Output fastq (default: stdout)')


def reverse_seq_records(input_iterator):
    """Lazily reverse every record produced by an iterator.

    Each record is reversed with a full negative-stride slice; nothing is
    complemented.

    :param input_iterator: Iterator of SeqRecord objects.
    :returns: Generator of reversed SeqRecord objects.
    :rtype: generator
    """
    # Equivalent to the literal [::-1] slice.
    backwards = slice(None, None, -1)
    for rec in input_iterator:
        yield rec[backwards]


if __name__ == '__main__':
    args = parser.parse_args()

    # Chain reader -> reverser -> writer; records are produced on demand by
    # the generator above.
    reversed_records = reverse_seq_records(
        seq_util.read_seq_records(args.input_fastq, format='fastq'))
    seq_util.write_seq_records(
        reversed_records, args.output_fastq, format='fastq')
# Read in chromosomes of the input genome (materialised as a list because
# they are passed to both the SAM header builder and the simulator):
chromosomes = list(seq_util.read_seq_records(args.input_fasta))

# Process error weights:
error_weights = np.array(parse_util.separated_list_to_floats(args.w))
# Normalise error weights to probabilities:
error_weights = parse_util.normalise_array(error_weights)
# Key the weights by error type, in the order substitution, insertion,
# deletion:
error_weights = dict(
    zip(['substitution', 'insertion', 'deletion'], error_weights))

# The SAM writer is optional; it is only created when args.s is given.
sw = None
if args.s is not None:
    sw = sam_writer.SamWriter(args.s, build_sam_header(chromosomes))

# Run the simulation inside try/finally so that the SAM writer is closed
# even when simulation or output writing raises.
try:
    simulation_iterator = simulate_sequencing(
        chromosomes, args.m, args.a, args.l, args.u, args.e,
        error_weights, args.b, args.q, args.n, sw)

    # Wrap in a progress bar unless args.Q is set or the reads are being
    # written to stdout.
    if not args.Q and args.output_fastq != sys.stdout:
        simulation_iterator = tqdm.tqdm(simulation_iterator, total=args.n)

    for simmed, sam in simulation_iterator:
        seq_util.write_seq_records(simmed, args.output_fastq, format='fastq')
        if sw is not None:
            sw.write(sam)
finally:
    if sw is not None:
        sw.close()
type=argparse.FileType('w'), default=sys.stdout) def record_filter(input_iter, in_format, read_names): """ Filter SeqRecord objects by length and mean quality. :param input_iter: Iterator of SeqRecord objects. :param in_format: Input format. :param to_alphabet: Convert to this alphabet. :returns: SeqRecord object. :rtype: generator """ for record in input_iter: if record.id in read_names: yield record if __name__ == '__main__': args = parser.parse_args() input_iterator = seq_util.read_seq_records(args.input_fastx, format=args.i) names = args.n.split(',') output_iterator = record_filter(input_iterator, args.i, names) seq_util.write_seq_records(output_iterator, args.output_fastx, format=args.o)
:param error_weights: Relative frequency of substitutions,insertions,deletions. :returns: Generator of SeqRecord objects. :rtype: generator """ for record in input_iter: mutated_seq = sim_seq.simulate_sequencing_errors(record.seq, error_rate, error_weights).seq record.seq = Seq(mutated_seq) yield record if __name__ == '__main__': args = parser.parse_args() # Set random seed: if args.z is not None: np.random.seed(args.z) # Process error weights: error_weights = np.array(parse_util.separated_list_to_floats(args.w)) # Normalise error weights to probabilities: error_weights = parse_util.normalise_array(error_weights) error_weights = dict( zip(['substitution', 'insertion', 'deletion'], error_weights)) input_iterator = seq_util.read_seq_records(args.input_fasta, format='fasta') simulation_iterator = simulate_errors(input_iterator, args.e, error_weights) seq_util.write_seq_records( simulation_iterator, args.output_fasta, format='fasta')
if __name__ == '__main__':
    args = parser.parse_args()

    input_iterator = seq_util.read_seq_records(args.input_fastx, format=args.i)

    # The base name of the input does not change between records, so it is
    # computed once rather than on every iteration.
    bn = path.basename(args.input_fastx.name)

    if args.b is None:
        # Splitting one record per file, named "<record id>.<ext>".
        # ext is whatever follows the last '.' of the input base name (the
        # whole base name if it contains no '.').
        ext = bn.rsplit('.', 1)[-1]
        for record in input_iterator:
            out_path = path.join(
                args.output_dir, "{}.{}".format(record.id, ext))
            # The context manager closes (and thereby flushes) the handle
            # even if writing the record raises.
            with open(out_path, 'w') as fh:
                seq_util.write_seq_records([record], fh, format=args.o)
    else:
        # Split into batches, one output file per batch, named
        # "batch_<index>_<input base name>" with a zero-based index.
        batches = batch_iterator(input_iterator, args.b)
        for i, records in enumerate(batches):
            out_path = path.join(
                args.output_dir, "batch_{}_{}".format(i, bn))
            with open(out_path, 'w') as fh:
                seq_util.write_seq_records(records, fh, format=args.o)