def test_write_fastq2(fastq_file, tmpdir):
    """Round-trip the first FASTQ record through ``write_fastq_sequence``.

    The first record of *fastq_file* is loaded with ``num_qual=True``, written
    to ``test.fq`` inside *tmpdir* and loaded again; header, sequence and
    qualities read back must equal the ones written.
    """
    header, seq, qual = next(fastq.load_fastq(fastq_file, num_qual=True))

    file_name = (tmpdir / 'test.fq').strpath
    file_handle = open_file(file_name, 'w')
    try:
        fastq.write_fastq_sequence(file_handle, header, seq, qual)
    finally:
        # Close the handle even when the write raises, so a failing test
        # does not leak an open file.
        file_handle.close()

    headerw, seqw, qualw = next(fastq.load_fastq(file_name, num_qual=True))

    # Qualities are converted to lists before comparing (presumably they are
    # array-like when num_qual=True -- TODO confirm).
    assert (header, seq, list(qual)) == (headerw, seqw, list(qualw))
def fq_sync_command(verbose, master_file, input_file, output_file):
    """Write the sequences of *input_file* whose headers match *master_file*.

    Both files are read with ``load_fastq(..., num_qual=False)``. The current
    master header is compared with each input header through
    ``compare_header``; on a match the input record is written to
    *output_file* and the master file advances to its next record. Processing
    stops when either file is exhausted.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        master_file: passed to ``load_fastq``, provides the reference headers
        input_file: passed to ``load_fastq``, provides the records to filter
        output_file: passed to ``write_fastq_sequence`` for matching records
    """
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    master_records = load_fastq(master_file, num_qual=False)
    written_count = 0

    # A default is used so that an empty master file does not raise an
    # unhandled StopIteration.
    master_record = next(master_records, None)

    if master_record is None:
        LOG.warning("The master file contains no sequences")
    else:
        master_header = master_record[0]
        # The header type is chosen once, from the first master header
        header_type = choose_header_type(master_header)

        for header, seq, qual in load_fastq(input_file, num_qual=False):
            if not compare_header(master_header, header, header_type):
                continue

            write_fastq_sequence(output_file, header, seq, qual)
            written_count += 1

            # The master only advances after a match
            master_record = next(master_records, None)
            if master_record is None:
                break
            master_header = master_record[0]

    LOG.info("Wrote %d FASTQ sequences", written_count)
def _sort_pair_key(seq_id, regex):
    """Return the key shared by the two mates of a pair for header *seq_id*.

    With *regex* set to None (simple header) the key is the header without its
    last character, otherwise it is the (lane, tile, xcoord, ycoord) tuple
    taken from the named groups of the regex match.
    """
    if regex is None:
        return seq_id[:-1]

    match = regex.search(seq_id)
    return (
        match.group('lane'),
        match.group('tile'),
        match.group('xcoord'),
        match.group('ycoord'),
    )


def sort(verbose, mate1_input, mate2_input, mate1_output, mate2_output):
    """Sort two fastq files

    Records are read in parallel from *mate1_input* and *mate2_input*. When
    the two records read together share the same key they are written
    directly, otherwise they are kept in memory until the record with the same
    key is found in the other file. Both files are read until the end, so
    records left in the longer file can still be paired with records kept in
    memory. Records without a mate are not written.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        mate1_input: passed to ``load_fastq``, first mates
        mate2_input: passed to ``load_fastq``, second mates
        mate1_output: passed to ``write_fastq_sequence`` for first mates
        mate2_output: passed to ``write_fastq_sequence`` for second mates
    """
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    LOG.info('Writing [mate1-output] to file (%s)',
             getattr(mate1_output, 'name', repr(mate1_output)))
    LOG.info('Writing [mate2-output] to file (%s)',
             getattr(mate2_output, 'name', repr(mate2_output)))

    regex = None
    simple_header = False

    # Records waiting for their mate, indexed by pair key
    mate1 = {}
    mate2 = {}

    count = 0
    wcount = 0

    reads1 = iter(load_fastq(mate1_input))
    reads2 = iter(load_fastq(mate2_input))

    while True:
        # zip() would stop at the end of the shorter file and lose the pairs
        # whose second record is only found in the rest of the longer one.
        rec1 = next(reads1, None)
        rec2 = next(reads2, None)
        if rec1 is None and rec2 is None:
            break

        count += 1

        # The header type is chosen once, from the first header read
        if (regex is None) and (not simple_header):
            first_record = rec1 if rec1 is not None else rec2
            regex = choose_header_type(first_record[0])
            if regex is None:
                simple_header = True
                LOG.info("Using a simple header structure")

        # (key, record) for the records available in this iteration
        current1 = None
        current2 = None
        if rec1 is not None:
            current1 = (_sort_pair_key(rec1[0], regex), rec1)
        if rec2 is not None:
            current2 = (_sort_pair_key(rec2[0], regex), rec2)

        # The two records are already in sync, no need to store them
        if (current1 is not None and current2 is not None and
                current1[0] == current2[0]):
            write_fastq_sequence(mate1_output, *current1[1])
            write_fastq_sequence(mate2_output, *current2[1])
            wcount += 1
            report_counts(count, wcount, count)
            continue

        if current1 is not None:
            mate1[current1[0]] = current1[1]
        if current2 is not None:
            mate2[current2[0]] = current2[1]

        # Checks if the mate of each new record was stored earlier
        for current in (current1, current2):
            if current is None:
                continue
            key = current[0]
            if key in mate1 and key in mate2:
                write_fastq_sequence(mate1_output, *mate1.pop(key))
                write_fastq_sequence(mate2_output, *mate2.pop(key))
                wcount += 1

        report_counts(count, wcount, count)

    report_counts(count, wcount, None)

    if mate1 or mate2:
        LOG.warning(
            "%d mate1 and %d mate2 sequences were left unpaired",
            len(mate1), len(mate2)
        )
def rand_sequence_command(verbose, num_seqs, gc_content, infer_params,
                          coding_prop, length, const_model, dist_loc, fastq,
                          save_model, read_model, progress, output_file):
    """Write *num_seqs* random sequences to *output_file*.

    A proportion *coding_prop* of the sequences comes from
    ``sequence.random_sequences_codon`` and the rest from
    ``sequence.random_sequences``, using nucleotide probabilities derived
    from *gc_content*. Each sequence gets a random UUID as its name.

    If *fastq* is true the output is FASTQ and a quality model is needed,
    chosen in this order: constant model (*const_model*), parameters inferred
    with ``infer_parameters`` (*infer_params*), a pickled model
    (*read_model*), or the decrease model. Inferred and pickled models also
    replace *length* and *gc_content*. The model in use is pickled to
    *save_model* if that is not None. Otherwise the output is FASTA.

    NOTE: *read_model* is loaded with ``pickle.load``, which can execute
    arbitrary code; only models from a trusted source should be read.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    if fastq:
        # default values, unless infer_parameters is used
        min_qual, max_qual = 0, 60

        if const_model:
            LOG.info("Using constant model with loc=%.1f", dist_loc)
            model = sequence.qualities_model_constant(length=length,
                                                      loc=dist_loc)
        elif infer_params:
            length, gc_content, model = infer_parameters(
                infer_params, fastq, progress)
            # the items after the first 2 are the quality bounds
            min_qual, max_qual = model[2:]
            model = model[:2]
        elif read_model:
            LOG.info('Reading saved model')
            stored = pickle.load(read_model)
            gc_content = stored['gc_content']
            lw = stored['lw']
            length = len(lw)
            dist_class = getattr(scipy.stats, stored['dist_family'])
            model = (lw, dist_class(*stored['dist']))
            # tries to read the min/max quality params, otherwise keep
            # defaults
            try:
                min_qual = stored['min_qual']
                max_qual = stored['max_qual']
            except KeyError:
                pass
        else:
            LOG.info("Using decrease model with loc=%.1f", dist_loc)
            model = sequence.qualities_model_decrease(length=length,
                                                      loc=dist_loc)

        if save_model is not None:
            LOG.info('Saving model to file (%s)',
                     getattr(save_model, 'name', repr(save_model)))
            model_data = {
                'lw': model[0],
                'dist': model[1].args,
                'dist_family': 'norm',
                'gc_content': gc_content,
                'max_qual': max_qual,
                'min_qual': min_qual,
            }
            pickle.dump(model_data, save_model)

    # A C T G
    at_prob = (1 - gc_content) / 2.
    gc_prob = gc_content / 2.
    prob = [at_prob, gc_prob, at_prob, gc_prob]

    LOG.info('%d Sequences, with a length of %d - coding proportion: %.1f',
             num_seqs, length, coding_prop)
    LOG.info("Probability A %.2f, C %.2f, T %.2f, G %.2f", *prob)

    num_coding = numpy.round(num_seqs * coding_prop).astype(int)

    # coding sequences first, followed by the non-coding ones
    coding_seqs = sequence.random_sequences_codon(n=num_coding, length=length)
    random_seqs = sequence.random_sequences(n=num_seqs - num_coding,
                                            length=length, p=prob)
    seq_it = itertools.chain(coding_seqs, random_seqs)

    if fastq:
        qual_it = sequence.random_qualities(
            n=num_seqs,
            length=length,
            model=model,
            max_qual=max_qual,
            min_qual=min_qual,
        )
    else:
        # placeholder values, qualities are not used for FASTA output
        qual_it = itertools.repeat(num_seqs)

    if progress:
        qual_it = tqdm(qual_it, total=num_seqs)

    for seq, qual in zip(seq_it, qual_it):
        seq_id = str(uuid.uuid4())
        if fastq:
            write_fastq_sequence(output_file, seq_id, seq, qual)
        else:
            fasta.write_fasta_sequence(output_file, seq_id, seq)
def deinterleave(verbose, strip, fastq_file, mate1_file, mate2_file):
    """Deinterleave a fastq file

    Records of *fastq_file* are kept in memory, indexed by a key taken from
    the header, until both mates of a pair have been read; the pair is then
    written to *mate1_file* and *mate2_file*. Records without a mate are not
    written.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        strip: if true only the part of the header before the first tab is
            written
        fastq_file: passed to ``load_fastq``, the interleaved records
        mate1_file: passed to ``write_fastq_sequence`` for first mates
        mate2_file: passed to ``write_fastq_sequence`` for second mates
    """
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    LOG.info('Writing [mate1-file] to file (%s)',
             getattr(mate1_file, 'name', repr(mate1_file)))
    LOG.info('Writing [mate2-file] to file (%s)',
             getattr(mate2_file, 'name', repr(mate2_file)))

    regex = None
    simple_header = False

    # Records waiting for their mate, indexed by pair key
    mate1 = {}
    mate2 = {}

    count = 0
    wcount = 0

    for seq_id, seq, qual in load_fastq(fastq_file):
        count += 1

        # The header type is chosen once, from the first header read
        if (regex is None) and (not simple_header):
            regex = choose_header_type(seq_id)
            if regex is None:
                LOG.info("Using a simple header structure")
                simple_header = True

        if simple_header:
            # The last character of the header is the mate number
            key = seq_id[:-1]
            mate = int(seq_id[-1])
        else:
            match = regex.search(seq_id)
            key = (
                match.group('lane'),
                match.group('tile'),
                match.group('xcoord'),
                match.group('ycoord'),
            )
            mate = int(match.group('mate'))

        sequence_name = seq_id.split('\t')[0] if strip else seq_id
        record = (sequence_name, seq, qual)

        # Any mate number other than 1 is treated as the second mate
        if mate == 1:
            mate1[key] = record
        else:
            mate2[key] = record

        # Explicit membership test instead of a try/except around the writes,
        # which would also hide a KeyError raised while writing.
        if key in mate1 and key in mate2:
            write_fastq_sequence(mate1_file, *mate1.pop(key))
            write_fastq_sequence(mate2_file, *mate2.pop(key))
            # wcount counts single records, 2 for each pair
            wcount += 2

        report_counts(count, wcount, count)

    report_counts(count, wcount, None)

    if mate1 or mate2:
        LOG.warning(
            "%d mate1 and %d mate2 sequences were left unpaired",
            len(mate1), len(mate2)
        )