示例#1
0
def test_write_fastq2(fastq_file, tmpdir):
    """Round-trip one FASTQ record with numeric qualities through a file.

    The first record of *fastq_file* is loaded with ``num_qual=True``,
    written to ``test.fq`` inside *tmpdir*, read back with the same loader
    and compared with the original record.

    NOTE(review): *fastq_file* and *tmpdir* are presumably pytest fixtures
    (``tmpdir`` is used like a ``py.path`` object) - confirm in conftest.
    """
    header, seq, qual = next(fastq.load_fastq(fastq_file, num_qual=True))

    file_name = (tmpdir / 'test.fq').strpath

    file_handle = open_file(file_name, 'w')
    try:
        fastq.write_fastq_sequence(file_handle, header, seq, qual)
    finally:
        # Close even when the write fails, so the handle is never leaked and
        # the data is flushed before the file is read back
        file_handle.close()

    headerw, seqw, qualw = next(fastq.load_fastq(file_name, num_qual=True))

    # Qualities are compared as plain lists
    assert (header, seq, list(qual)) == (headerw, seqw, list(qualw))
示例#2
0
def fq_sync_command(verbose, master_file, input_file, output_file):
    """Write the records of *input_file* whose headers match *master_file*.

    Both files are read sequentially with ``load_fastq``. The current master
    header is compared with each input header through ``compare_header``; on
    a match the input record is written to *output_file* and the master
    advances to its next record. Processing stops when either the master or
    the input is exhausted. The number of written records is logged.

    NOTE(review): since both files are consumed in a single pass, this
    presumably expects them to list reads in the same relative order -
    confirm against the callers.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        master_file: FASTQ source (passed to ``load_fastq``) whose headers
            decide which records are kept.
        input_file: FASTQ source (passed to ``load_fastq``) to filter.
        output_file: destination passed to ``write_fastq_sequence``.
    """
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    master_records = load_fastq(master_file, num_qual=False)
    written_count = 0

    # Guard against an empty master file: without a first header there is
    # nothing to sync on, and a bare next() would raise StopIteration
    master_record = next(master_records, None)
    if master_record is None:
        LOG.warning("Master file contains no sequences")
        LOG.info("Wrote %d FASTQ sequences", written_count)
        return

    master_header = master_record[0]
    # The header type is chosen once, from the first master header
    header_type = choose_header_type(master_header)

    for header, seq, qual in load_fastq(input_file, num_qual=False):
        if not compare_header(master_header, header, header_type):
            continue

        write_fastq_sequence(output_file, header, seq, qual)
        written_count += 1

        # Move to the next master header; stop once the master is exhausted
        master_record = next(master_records, None)
        if master_record is None:
            break
        master_header = master_record[0]

    LOG.info("Wrote %d FASTQ sequences", written_count)
示例#3
0
def sort(verbose, mate1_input, mate2_input, mate1_output, mate2_output):
    """Sort two fastq files

    The two inputs are read in lockstep (iteration ends with the shorter
    one) and a key is derived from every header. With a "simple" header
    (``choose_header_type`` returned ``None`` for the first mate1 header)
    the key is the header without its last character; otherwise it is the
    (lane, tile, xcoord, ycoord) tuple captured by the returned regex.

    When both records of an iteration share the key they are written
    straight away. Otherwise they are kept in memory and written as soon as
    the record with the same key has been seen in the other file. Records
    that never find a partner are not written.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    mate1_name = getattr(mate1_output, 'name', repr(mate1_output))
    LOG.info('Writing [mate1-output] to file (%s)', mate1_name)
    mate2_name = getattr(mate2_output, 'name', repr(mate2_output))
    LOG.info('Writing [mate2-output] to file (%s)', mate2_name)

    # Header parsing mode, decided on the first pair of records
    header_regex = None
    plain_headers = False
    key_fields = ('lane', 'tile', 'xcoord', 'ycoord')

    # Records still waiting for their partner, indexed by key
    pending1 = {}
    pending2 = {}

    pairs_read = 0
    pairs_written = 0

    record_pairs = zip(load_fastq(mate1_input), load_fastq(mate2_input))

    for (name1, bases1, quals1), (name2, bases2, quals2) in record_pairs:
        pairs_read += 1

        if header_regex is None and not plain_headers:
            header_regex = choose_header_type(name1)
            if header_regex is None:
                plain_headers = True
                LOG.info("Using a simple header structure")

        if plain_headers:
            # Drop the last character of the header to get the key
            key1 = name1[:-1]
            key2 = name2[:-1]
        else:
            found1 = header_regex.search(name1)
            found2 = header_regex.search(name2)
            key1 = tuple(found1.group(field) for field in key_fields)
            key2 = tuple(found2.group(field) for field in key_fields)

        record1 = (name1, bases1, quals1)
        record2 = (name2, bases2, quals2)

        if key1 == key2:
            # The two records are already in sync: nothing to remember
            write_fastq_sequence(mate1_output, *record1)
            write_fastq_sequence(mate2_output, *record2)
            pairs_written += 1
        else:
            pending1[key1] = record1
            pending2[key2] = record2

            # The partner of the mate1 record was seen earlier in mate2
            if key1 in pending2:
                first = pending1.pop(key1)
                second = pending2.pop(key1)
                write_fastq_sequence(mate1_output, *first)
                write_fastq_sequence(mate2_output, *second)
                pairs_written += 1

            # The partner of the mate2 record was seen earlier in mate1
            if key2 in pending1:
                first = pending1.pop(key2)
                second = pending2.pop(key2)
                write_fastq_sequence(mate1_output, *first)
                write_fastq_sequence(mate2_output, *second)
                pairs_written += 1

        report_counts(pairs_read, pairs_written, pairs_read)

    report_counts(pairs_read, pairs_written, None)
示例#4
0
def rand_sequence_command(verbose, num_seqs, gc_content, infer_params,
                          coding_prop, length, const_model, dist_loc, fastq,
                          save_model, read_model, progress, output_file):
    """Generate random sequences and write them as FASTA or FASTQ.

    ``num_seqs * coding_prop`` (rounded) sequences come from
    ``sequence.random_sequences_codon`` and the remainder from
    ``sequence.random_sequences`` using nucleotide probabilities derived
    from *gc_content*. Every sequence gets a random UUID as identifier.

    When *fastq* is true a quality model is selected, in this order of
    precedence: constant model (*const_model*), parameters inferred with
    ``infer_parameters`` (*infer_params*), a previously pickled model
    (*read_model*), or the decrease model. The inferred and the pickled
    model also override *length* and *gc_content*.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        num_seqs: total number of sequences to write.
        gc_content: GC fraction used for the nucleotide probabilities.
        infer_params: source passed to ``infer_parameters``; falsy to skip.
        coding_prop: proportion of sequences generated by codons.
        length: length of the generated sequences.
        const_model: if true use the constant quality model.
        dist_loc: ``loc`` value for the constant/decrease quality models.
        fastq: if true write FASTQ records, otherwise FASTA.
        save_model: file handle the quality model is pickled to, or None.
        read_model: file handle of a pickled quality model; falsy to skip.
        progress: if true wrap the iteration in a ``tqdm`` progress bar.
        output_file: destination passed to the sequence writers.
    """
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    if fastq:
        # default values, unless infer_parameters is used
        min_qual = 0
        max_qual = 60
        if const_model:
            LOG.info("Using constant model with loc=%.1f", dist_loc)
            model = sequence.qualities_model_constant(length=length,
                                                      loc=dist_loc)
        elif infer_params:
            length, gc_content, model = infer_parameters(
                infer_params, fastq, progress)
            # the inferred model carries the quality bounds as its last
            # two items
            min_qual, max_qual = model[2:]
            model = model[:2]
        elif read_model:
            LOG.info('Reading saved model')
            # SECURITY: unpickling can execute arbitrary code; only load
            # model files that come from a trusted source
            saved_model = pickle.load(read_model)
            gc_content = saved_model['gc_content']
            lw = saved_model['lw']
            length = len(lw)
            model = (lw,
                     getattr(scipy.stats,
                             saved_model['dist_family'])(*saved_model['dist']))
            # tries to read the min/max quality params, otherwise keep defaults
            try:
                min_qual = saved_model['min_qual']
                max_qual = saved_model['max_qual']
            except KeyError:
                pass
        else:
            LOG.info("Using decrease model with loc=%.1f", dist_loc)
            model = sequence.qualities_model_decrease(length=length,
                                                      loc=dist_loc)

        if save_model is not None:
            LOG.info('Saving model to file (%s)',
                     getattr(save_model, 'name', repr(save_model)))
            # NOTE(review): dist_family is always stored as 'norm', even if
            # the model was read from a file declaring another family -
            # confirm this is intended
            pickle.dump(
                dict(lw=model[0],
                     dist=model[1].args,
                     dist_family='norm',
                     gc_content=gc_content,
                     max_qual=max_qual,
                     min_qual=min_qual), save_model)

    # A C T G
    prob = [(1 - gc_content) / 2., gc_content / 2.] * 2

    LOG.info('%d Sequences, with a length of %d - coding proportion: %.1f',
             num_seqs, length, coding_prop)
    LOG.info("Probability A %.2f, C %.2f, T %.2f, G %.2f", *prob)

    num_coding = numpy.round(num_seqs * coding_prop).astype(int)
    seq_it = itertools.chain(
        sequence.random_sequences_codon(n=num_coding, length=length),
        sequence.random_sequences(n=num_seqs - num_coding,
                                  length=length,
                                  p=prob))
    if fastq:
        qual_it = sequence.random_qualities(
            n=num_seqs,
            length=length,
            model=model,
            max_qual=max_qual,
            min_qual=min_qual,
        )
    else:
        # FASTA output needs no qualities: pair every sequence with a
        # placeholder. repeat(None, n) yields exactly n items, whereas
        # repeat(n) would be an endless stream of the value n
        qual_it = itertools.repeat(None, num_seqs)

    if progress:
        qual_it = tqdm(qual_it, total=num_seqs)

    for seq, qual in zip(seq_it, qual_it):
        seq_id = str(uuid.uuid4())
        if fastq:
            write_fastq_sequence(output_file, seq_id, seq, qual)
        else:
            fasta.write_fasta_sequence(output_file, seq_id, seq)
示例#5
0
def deinterleave(verbose, strip, fastq_file, mate1_file, mate2_file):
    """Deinterleave a fastq file

    Every record of *fastq_file* is assigned a key and a mate number taken
    from its header. With a "simple" header (``choose_header_type`` returned
    ``None`` for the first header) the last character is the mate number and
    the rest is the key; otherwise the key is the (lane, tile, xcoord,
    ycoord) tuple and the mate is the ``mate`` group of the returned regex.
    Any mate number other than 1 is treated as the second mate.

    Records are kept in memory until both mates of a key are available, then
    written to *mate1_file* and *mate2_file* respectively. Records that never
    find their mate are not written.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        strip: if true only the part of the header before the first tab is
            written as sequence name.
        fastq_file: interleaved FASTQ source passed to ``load_fastq``.
        mate1_file: destination for the first mates.
        mate2_file: destination for the second mates.
    """
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    LOG.info('Writing [mate1-file] to file (%s)',
             getattr(mate1_file, 'name', repr(mate1_file)))
    LOG.info('Writing [mate2-file] to file (%s)',
             getattr(mate2_file, 'name', repr(mate2_file)))

    # Header parsing mode, decided on the first record
    regex = None
    simple_header = False

    # Records waiting for their mate, indexed by key
    mate1 = {}
    mate2 = {}

    # Both counters are expressed in single sequences, not pairs
    count = 0
    wcount = 0

    for seq_id, seq, qual in load_fastq(fastq_file):

        count += 1

        if (regex is None) and (not simple_header):
            regex = choose_header_type(seq_id)
            if regex is None:
                LOG.info("Using a simple header structure")
                simple_header = True

        if simple_header:
            key = seq_id[:-1]
            mate = int(seq_id[-1])
        else:
            match = regex.search(seq_id)
            key = (match.group('lane'), match.group('tile'),
                   match.group('xcoord'), match.group('ycoord'))
            mate = int(match.group('mate'))

        sequence_name = seq_id.split('\t')[0] if strip else seq_id

        if mate == 1:
            mate1[key] = (sequence_name, seq, qual)
        else:
            mate2[key] = (sequence_name, seq, qual)

        # Write the pair once both mates are known. An explicit membership
        # test is used instead of a try/except around the whole step, so a
        # KeyError raised while writing is not silently swallowed
        if key in mate1 and key in mate2:
            seq1 = mate1.pop(key)
            seq2 = mate2.pop(key)
            write_fastq_sequence(mate1_file, *seq1)
            write_fastq_sequence(mate2_file, *seq2)
            wcount += 2

        report_counts(count, wcount, count)

    report_counts(count, wcount, None)