예제 #1
0
def test_write_fastq2(fastq_file, tmpdir):
    """Round-trip one record through write_fastq_sequence and load_fastq.

    The first record of ``fastq_file`` is loaded with ``num_qual=True``,
    written to a file inside ``tmpdir`` and loaded again from that file; the
    header, the sequence and the qualities (compared as lists) must be equal.
    """
    header, seq, qual = next(fastq.load_fastq(fastq_file, num_qual=True))

    file_name = (tmpdir / 'test.fq').strpath

    file_handle = open_file(file_name, 'w')
    try:
        fastq.write_fastq_sequence(file_handle, header, seq, qual)
    finally:
        # Close even when the write fails, so the handle is not leaked and
        # the write error is the one the test reports.
        file_handle.close()

    headerw, seqw, qualw = next(fastq.load_fastq(file_name, num_qual=True))

    assert (header, seq, list(qual)) == (headerw, seqw, list(qualw))
예제 #2
0
def infer_parameters(file_handle, fastq_bool, progress):
    """Collect length, GC content and quality model from a sequence file.

    Arguments:
        file_handle: open file to read; its ``name`` attribute is logged
        fastq_bool: if true the file is read with ``load_fastq`` (numeric
            qualities), otherwise with ``fasta.load_fasta``
        progress: if true the record iterator is wrapped in ``tqdm``

    Returns:
        tuple: ``(length, gc_content, model)`` where *length* is the longest
        sequence length seen (0 if no record was read), *gc_content* is the
        ``numpy.mean`` of the per-record values returned by
        ``sequence.sequence_gc_content`` and *model* is the result of
        ``sequence.extrapolate_model`` on the collected qualities, or
        ``None`` when ``fastq_bool`` is false.
    """
    LOG.info("Extrapolating model from file %s", file_handle.name)

    if fastq_bool:
        records = load_fastq(file_handle, num_qual=True)
    else:
        records = fasta.load_fasta(file_handle)

    if progress:
        records = tqdm(records)

    longest = 0
    gc_values = []
    # Only filled (and only used) when reading FASTQ input
    qualities = []

    for record in records:
        bases = record[1]
        longest = max(longest, len(bases))
        gc_values.append(sequence.sequence_gc_content(bases))
        if fastq_bool:
            qualities.append(record[2])

    model = sequence.extrapolate_model(qualities) if fastq_bool else None
    mean_gc = numpy.mean(gc_values)

    return longest, mean_gc, model
예제 #3
0
def convert_command(verbose, fastq_file, fasta_file):
    # Pass the identifier and sequence of every record read from
    # `fastq_file` to `fasta.write_fasta_sequence`, targeting `fasta_file`;
    # the qualities are not used. `verbose` selects DEBUG logging instead
    # of INFO.
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    # Objects without a `name` attribute are reported through their repr
    output_name = getattr(fasta_file, 'name', repr(fasta_file))
    LOG.info("Writing FASTA file (%s)", output_name)

    for record_id, bases, _quality in load_fastq(fastq_file):
        fasta.write_fasta_sequence(fasta_file, record_id, bases)
예제 #4
0
def test_load_fastq4(fastq_file):
    """Check the qualities of the first record loaded with num_qual=True."""
    expected_qualities = [
        24, 34, 26, 28, 26, 28, 27, 24, 31, 19, 23, 21, 23, 29, 24, 25, 21, 22,
        32, 32, 27, 24, 29, 21, 20, 27, 28, 29, 20, 24, 16
    ]

    records = fastq.load_fastq(fastq_file, num_qual=True)
    _header, _seq, qualities = next(records)

    assert list(qualities) == expected_qualities
예제 #5
0
def fq_sync_command(verbose, master_file, input_file, output_file):
    # Write to `output_file` the records of `input_file` whose header matches
    # (according to `compare_header`) the current header of `master_file`.
    #
    # Both files are read once, front to back: the master advances to its
    # next header only after a match, and reading stops as soon as the master
    # has no more records. The header type is chosen from the first master
    # header. `verbose` selects DEBUG logging instead of INFO.
    #
    # An empty master file writes nothing and logs a count of 0, instead of
    # failing with an uncaught StopIteration.

    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    # Separate name, so the `master_file` argument is not shadowed
    master_records = load_fastq(master_file, num_qual=False)
    master_record = next(master_records, None)

    written_count = 0

    if master_record is not None:
        master_header = master_record[0]
        header_type = choose_header_type(master_header)

        for header, seq, qual in load_fastq(input_file, num_qual=False):

            if not compare_header(master_header, header, header_type):
                continue

            write_fastq_sequence(output_file, header, seq, qual)
            written_count += 1

            master_record = next(master_records, None)
            if master_record is None:
                # Every master header has been matched, nothing left to find
                break
            master_header = master_record[0]

    LOG.info("Wrote %d FASTQ sequences", written_count)
예제 #6
0
def test_load_fastq4(fastq_file):
    """Check the quality string of the first record (default options)."""
    records = fastq.load_fastq(fastq_file)
    _header, _seq, quality_string = next(records)

    expected = '9C;=;=<9@4868>9:67AA<9>65<=>591'
    assert quality_string == expected
예제 #7
0
def test_load_fastq3(fastq_file):
    """Check the header of the first record returned by load_fastq."""
    records = fastq.load_fastq(fastq_file)
    first_header, _seq, _qual = next(records)

    expected = 'cluster_2:UMI_ATTCCG'
    assert first_header == expected
예제 #8
0
def test_load_fastq2(fastq_file):
    """Check the sequence of the first record returned by load_fastq."""
    records = fastq.load_fastq(fastq_file)
    _header, first_sequence, _qual = next(records)

    expected = 'TTTCCGGGGCACATAATCTTCAGCCGGGCGC'
    assert first_sequence == expected
예제 #9
0
def test_load_fastq1(fastq_file):
    """Check that load_fastq yields 250 records from the fixture file."""
    record_count = 0
    for _record in fastq.load_fastq(fastq_file):
        record_count += 1

    assert record_count == 250
예제 #10
0
def sort(verbose, mate1_input, mate2_input, mate1_output, mate2_output):
    "Sort two fastq files"
    # Reads `mate1_input` and `mate2_input` in lockstep (stopping at the
    # shorter one) and writes records to `mate1_output` / `mate2_output` so
    # that records sharing the same key are written together, one per file.
    #
    # The key is taken from the header: the named groups lane, tile, xcoord
    # and ycoord of the regex returned by `choose_header_type`, or, when that
    # returns None for the first mate1 header, the header without its last
    # character. Records whose partner never shows up stay in memory and are
    # not written. `verbose` selects DEBUG logging instead of INFO.

    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    # Objects without a `name` attribute are reported through their repr
    LOG.info('Writing [mate1-output] to file (%s)',
             getattr(mate1_output, 'name', repr(mate1_output)))
    LOG.info('Writing [mate2-output] to file (%s)',
             getattr(mate2_output, 'name', repr(mate2_output)))

    header_regex = None
    use_simple_header = False
    key_groups = ('lane', 'tile', 'xcoord', 'ycoord')

    # Records still waiting for their partner, indexed by key
    pending1 = {}
    pending2 = {}

    # `count` is the number of record pairs read, `wcount` the pairs written
    count = 0
    wcount = 0

    read_pairs = zip(load_fastq(mate1_input), load_fastq(mate2_input))

    for count, (first, second) in enumerate(read_pairs, 1):
        header1, bases1, quals1 = first
        header2, bases2, quals2 = second

        # The header type is decided once, on the first mate1 header
        if header_regex is None and not use_simple_header:
            header_regex = choose_header_type(header1)
            if header_regex is None:
                use_simple_header = True
                LOG.info("Using a simple header structure")

        if use_simple_header:
            key1, key2 = header1[:-1], header2[:-1]
        else:
            match1 = header_regex.search(header1)
            match2 = header_regex.search(header2)
            # group() with several names returns the values as a tuple
            key1 = match1.group(*key_groups)
            key2 = match2.group(*key_groups)

        record1 = (header1, bases1, quals1)
        record2 = (header2, bases2, quals2)

        if key1 == key2:
            # The two records are already a pair, no need to store them
            write_fastq_sequence(mate1_output, *record1)
            write_fastq_sequence(mate2_output, *record2)
            wcount += 1
        else:
            pending1[key1] = record1
            pending2[key2] = record2

            # A record just stored completes a pair when the other side is
            # already holding the same key
            for key, other_side in ((key1, pending2), (key2, pending1)):
                if key in other_side:
                    write_fastq_sequence(mate1_output, *pending1[key])
                    write_fastq_sequence(mate2_output, *pending2[key])
                    del pending1[key]
                    del pending2[key]
                    wcount += 1

        report_counts(count, wcount, count)

    report_counts(count, wcount, None)
예제 #11
0
def deinterleave(verbose, strip, fastq_file, mate1_file, mate2_file):
    "Deinterleave a fastq file"
    # Reads `fastq_file` and writes each pair of records sharing the same key
    # to `mate1_file` (mate 1) and `mate2_file` (any other mate number) as
    # soon as both have been read.
    #
    # Key and mate number come from the header: the named groups lane, tile,
    # xcoord, ycoord and mate of the regex returned by `choose_header_type`,
    # or, when that returns None for the first header, the header without
    # its last character and that last character as the mate number.
    # With `strip` the written header is cut at the first tab. Records whose
    # partner never shows up stay in memory and are not written. `verbose`
    # selects DEBUG logging instead of INFO.

    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    # Objects without a `name` attribute are reported through their repr
    LOG.info('Writing [mate1-file] to file (%s)',
             getattr(mate1_file, 'name', repr(mate1_file)))
    LOG.info('Writing [mate2-file] to file (%s)',
             getattr(mate2_file, 'name', repr(mate2_file)))

    regex = None
    simple_header = False

    # Records still waiting for their partner, indexed by key
    mate1 = {}
    mate2 = {}

    # `count` is the number of records read, `wcount` the records written
    count = 0
    wcount = 0

    for seq_id, seq, qual in load_fastq(fastq_file):

        count += 1

        # The header type is decided once, on the first header
        if (regex is None) and (not simple_header):
            regex = choose_header_type(seq_id)
            if regex is None:
                LOG.info("Using a simple header structure")
                simple_header = True

        if simple_header:
            key = seq_id[:-1]
            mate = int(seq_id[-1])
        else:
            match = regex.search(seq_id)
            key = (match.group('lane'), match.group('tile'),
                   match.group('xcoord'), match.group('ycoord'))
            mate = int(match.group('mate'))

        if strip:
            sequence_name = seq_id.split('\t')[0]
        else:
            sequence_name = seq_id

        if mate == 1:
            mate1[key] = (sequence_name, seq, qual)
        else:
            mate2[key] = (sequence_name, seq, qual)

        # Explicit membership test instead of a try/except around the whole
        # step: a KeyError raised while writing must not be mistaken for
        # "partner not read yet" and silently discarded.
        if key in mate1 and key in mate2:
            write_fastq_sequence(mate1_file, *mate1[key])
            write_fastq_sequence(mate2_file, *mate2[key])
            wcount += 2
            del mate1[key]
            del mate2[key]

        report_counts(count, wcount, count)

    report_counts(count, wcount, None)