示例#1
0
    def test_parse_error(self):
        """parse_fastq should raise FastqParseError for each failing call."""
        # (input lines, keyword arguments) pairs that are expected to fail
        failing_calls = (
            (self.FASTQ_EXAMPLE_2, {'strict': True}),
            (self.FASTQ_EXAMPLE_3, {'phred_offset': 64}),
        )
        for fastq_lines, parse_kwargs in failing_calls:
            with self.assertRaises(FastqParseError):
                # list() forces the parser to consume the whole input
                list(parse_fastq(fastq_lines, **parse_kwargs))
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Write the index reads whose headers match the joined paired-end reads.

    Both files are walked in parallel. For every record in the joined file
    the index file is advanced until a record with an identical label is
    found; that index record is written to the output file, and any index
    records skipped along the way are dropped.

    Parameters
    ----------
    joined_fp : str
        File path to the paired-end assembled fastq file.
    index_fp : str
        File path to the index / barcode reads fastq file.

    Returns
    -------
    str
        Path of the file written: ``joined_fp`` without its extension,
        followed by ``_barcodes.fastq``.

    Raises
    ------
    StopIteration
        If the index file is exhausted before a label matching the current
        joined read has been found.

    WARNING: Assumes reads are in the same order in both files, except for
    cases in which the corresponding read in the joined_fp file is missing
    (i.e. pairs failed to assemble).
    """
    # base new index file name on joined paired-end file name:
    j_path, _ = os.path.splitext(joined_fp)
    filtered_bc_outfile_path = j_path + '_barcodes.fastq'

    end_of_index_msg = (
        "\n\nReached end of index-reads file"
        " before iterating through joined paired-end-reads file!"
        " Except for missing paired-end reads that did not survive"
        " assembly, your index and paired-end reads files must be in"
        " the same order! Also, check that the index-reads and"
        " paired-end reads have identical headers. The last joined"
        " paired-end ID processed was:\n'%s'\n")

    jh = ih = fbc_fh = None
    try:
        # open files (handles normal / gzipped data)
        jh = qiime_open(joined_fp)
        ih = qiime_open(index_fp)
        fbc_fh = open(filtered_bc_outfile_path, 'w')

        # Set up iterators
        index_fastq_iter = parse_fastq(ih, strict=False)
        joined_fastq_iter = parse_fastq(jh, strict=False)

        # Write barcodes / index reads that we observed within
        # the joined paired-ends.
        for joined_label, _seq, _qual in joined_fastq_iter:
            try:
                index_label, index_seq, index_qual = next(index_fastq_iter)
                while joined_label != index_label:
                    index_label, index_seq, index_qual = \
                        next(index_fastq_iter)
            except StopIteration:
                # Covers the very first fetch as well, so running out of
                # index reads always yields the explanatory message.
                raise StopIteration(end_of_index_msg % joined_label)

            fbc_fh.write('@%s\n%s\n+\n%s\n'
                         % (index_label, index_seq, index_qual))
    finally:
        # Release every handle that was opened, even on failure.
        for handle in (ih, jh, fbc_fh):
            if handle is not None:
                handle.close()

    return filtered_bc_outfile_path
示例#3
0
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Writes new index file based on surviving assembled paired-ends.

    This function iterates through the joined reads file and index file.
    Only those index-reads within the file at index_fp, that have headers
    matching those within the joined-pairs at joined_fp, are written
    to file. Index reads that are skipped while searching for the next
    matching header are discarded.

    Parameters
    ----------
    joined_fp : str
        File path to paired-end assembled fastq file.
    index_fp : str
        File path to index / barcode reads fastq file.

    Returns
    -------
    str
        Path of the new index file: the joined file path with its
        extension stripped and ``_barcodes.fastq`` appended.

    Raises
    ------
    StopIteration
        When the index reads run out before the header of the current
        joined read has been matched.

    WARNING: Assumes reads are in the same order in both files,
             except for cases in which the corresponding
             read in the joined_fp file is missing (i.e. pairs
             failed to assemble).
    """
    # base new index file name on joined paired-end file name:
    j_path, _ = os.path.splitext(joined_fp)
    filtered_bc_outfile_path = j_path + '_barcodes.fastq'

    out_of_index_reads = (
        "\n\nReached end of index-reads file"
        " before iterating through joined paired-end-reads file!"
        " Except for missing paired-end reads that did not survive"
        " assembly, your index and paired-end reads files must be in"
        " the same order! Also, check that the index-reads and"
        " paired-end reads have identical headers. The last joined"
        " paired-end ID processed was:\n'%s'\n")

    opened = []
    try:
        # open files (handles normal / gzipped data)
        jh = qiime_open(joined_fp)
        opened.append(jh)
        ih = qiime_open(index_fp)
        opened.append(ih)
        fbc_fh = open(filtered_bc_outfile_path, 'w')
        opened.append(fbc_fh)

        index_fastq_iter = parse_fastq(ih, strict=False)

        for joined_label, _seq, _qual in parse_fastq(jh, strict=False):
            try:
                # Advance through the index reads until the header of the
                # current joined read is found.
                index_record = next(index_fastq_iter)
                while index_record[0] != joined_label:
                    index_record = next(index_fastq_iter)
            except StopIteration:
                # Also raised for the first fetch, which previously
                # surfaced as a StopIteration without any message.
                raise StopIteration(out_of_index_reads % joined_label)

            index_label, index_seq, index_qual = index_record
            fbc_fh.write('@%s\n%s\n+\n%s\n'
                         % (index_label, index_seq, index_qual))
    finally:
        # Close whatever was successfully opened, error or not.
        for handle in opened:
            handle.close()

    return filtered_bc_outfile_path
示例#4
0
 def test_parse(self):
     """Each parsed record should agree with its expected entry in DATA."""
     parsed_records = parse_fastq(self.FASTQ_EXAMPLE, phred_offset=64)
     for label, seq, qual in parsed_records:
         self.assertIn(label, DATA)
         expected = DATA[label]
         self.assertEqual(seq, expected["seq"])
         # qual is compared element-wise, hence the .all()
         self.assertTrue((qual == expected["qual"]).all())
示例#5
0
def extract_reads_from_interleaved(
        input_fp, forward_id, reverse_id, output_dir):
    """Split a single interleaved fastq file into forward and reverse files.

    Each record of the input is written to ``forward_reads.fastq`` or
    ``reverse_reads.fastq`` inside ``output_dir``, depending on which
    identifier occurs in its label.

    input_fp: file path to the interleaved fastq input
    forward_id: substring of the label that identifies a forward read
    reverse_id: substring of the label that identifies a reverse read
    output_dir: file path to the folder the two output files are written to

    Raises ValueError if a label contains neither identifier. All files are
    closed before the error propagates.
    """
    forward_fp = join(output_dir, "forward_reads.fastq")
    reverse_fp = join(output_dir, "reverse_reads.fastq")

    # The with/finally pair guarantees that the outputs and the input handle
    # are closed even if parsing fails part-way through.
    with open(forward_fp, 'w') as ffp, open(reverse_fp, 'w') as rfp:
        input_fh = qiime_open(input_fp)
        try:
            for label, seq, qual in parse_fastq(input_fh, strict=False):
                fastq_string = format_fastq_record(label, seq, qual)
                # The forward identifier is tested first, so a label that
                # carries both identifiers is treated as a forward read.
                if forward_id in label:
                    ffp.write(fastq_string)
                elif reverse_id in label:
                    rfp.write(fastq_string)
                else:
                    raise ValueError(
                        "One of the input sequences doesn't have either "
                        "identifier "
                        "or it has both.\nLabel: %s\nForward: %s\n "
                        "Reverse: %s" % (label, forward_id, reverse_id))
        finally:
            input_fh.close()
示例#6
0
 def test_parse(self):
     """Parsed sequence and quality should match the DATA fixture."""
     for record in parse_fastq(self.FASTQ_EXAMPLE, phred_offset=64):
         label, seq, qual = record
         self.assertIn(label, DATA)
         reference = DATA[label]
         self.assertEqual(seq, reference["seq"])
         qual_matches = (qual == reference["qual"]).all()
         self.assertTrue(qual_matches)
def extract_reads_from_interleaved(input_fp, forward_id, reverse_id,
                                   output_dir):
    """Parse one interleaved fastq file into a forward and a reverse file.

    Records are routed by label: those containing ``forward_id`` go to
    ``forward_reads.fastq``, those containing ``reverse_id`` go to
    ``reverse_reads.fastq``; both files are created in ``output_dir``.
    Quality-score range enforcement is switched off while parsing.

    input_fp: file path to the interleaved fastq input
    forward_id: substring of the label that identifies a forward read
    reverse_id: substring of the label that identifies a reverse read
    output_dir: file path to the output folder

    Raises ValueError if a label contains neither identifier. Input and
    output files are closed whether or not an error occurs.
    """
    forward_fp = join(output_dir, "forward_reads.fastq")
    reverse_fp = join(output_dir, "reverse_reads.fastq")

    with open(forward_fp, 'w') as ffp, open(reverse_fp, 'w') as rfp:
        input_fh = qiime_open(input_fp)
        try:
            records = parse_fastq(input_fh,
                                  strict=False,
                                  enforce_qual_range=False)
            for label, seq, qual in records:
                fastq_string = format_fastq_record(label, seq, qual)
                # forward_id wins when a label happens to hold both
                # identifiers, because it is checked first.
                if forward_id in label:
                    ffp.write(fastq_string)
                elif reverse_id in label:
                    rfp.write(fastq_string)
                else:
                    raise ValueError(
                        "One of the input sequences doesn't have either "
                        "identifier "
                        "or it has both.\nLabel: %s\nForward: %s\n "
                        "Reverse: %s" % (label, forward_id, reverse_id))
        finally:
            # the original never closed the input handle
            input_fh.close()
示例#8
0
def filter_fastq(input_seqs_f, output_seqs_f, seqs_to_keep, negate=False,
                 seqid_f=None):
    """ Write the selected fastq records of input_seqs_f to output_seqs_f

        A record is selected by seqid_f(seq_id) when seqid_f is given.
        Otherwise it is selected when the first whitespace-delimited token
        of its id equals the first token of an entry in seqs_to_keep.
        A true negate inverts the selection. output_seqs_f is closed
        once all records have been processed.
    """
    if seqid_f is not None:
        is_selected = seqid_f
    else:
        wanted_ids = set(entry.split()[0] for entry in seqs_to_keep)

        def is_selected(seq_id):
            return seq_id.split()[0] in wanted_ids

    # Wrap the selector only when the caller asked for the complement
    if negate:
        def keep_seq(seq_id):
            return not is_selected(seq_id)
    else:
        keep_seq = is_selected

    records = parse_fastq(input_seqs_f, enforce_qual_range=False)
    for seq_id, seq, qual in records:
        if not keep_seq(seq_id):
            continue
        output_seqs_f.write(format_fastq_record(seq_id, seq, qual))
    output_seqs_f.close()
示例#9
0
def filter_fastq(input_seqs_f,
                 output_seqs_f,
                 seqs_to_keep,
                 negate=False,
                 seqid_f=None):
    """ Filter the fastq records of input_seqs_f into output_seqs_f

        Without seqid_f, a record matches when the first
        whitespace-separated token of its id is also the first token of
        some entry in seqs_to_keep; with seqid_f, a record matches when
        seqid_f(seq_id) is true. Matching records are written unless
        negate is true, in which case the non-matching ones are written.
        output_seqs_f is closed at the end.
    """
    if seqid_f is None:
        first_tokens = frozenset(
            entry.split()[0] for entry in seqs_to_keep)

        def matches(seq_id):
            return seq_id.split()[0] in first_tokens
    else:
        matches = seqid_f

    invert = bool(negate)
    for seq_id, seq, qual in parse_fastq(input_seqs_f,
                                         enforce_qual_range=False):
        # Write when "matched" and "invert" disagree: matched records
        # normally, unmatched records when the filter is negated.
        if bool(matches(seq_id)) != invert:
            record = format_fastq_record(seq_id, seq, qual)
            output_seqs_f.write(record)
    output_seqs_f.close()
示例#10
0
文件: utils.py 项目: danman0091/keds
def split_by_index(read1, read2, barcodes, bc_pos=(26, 6),
                   output_dir='../data'):
    '''
    Splits read pairs given in `read1` and `read2` according to the
    collection of barcodes given in `barcodes`.

    The barcode is read from the read-1 sequence; its position and length
    can be specified in `bc_pos` as a (start, length) tuple. Pairs whose
    barcode is not in `barcodes` are dropped. For every barcode seen, the
    pair is appended to `<barcode>_R1.fastq.gz` / `<barcode>_R2.fastq.gz`
    inside `output_dir` (default: the previously hard-coded '../data').
    Quality values are written as characters with an offset of 33.

    Progress and a summary are printed to stdout. All output files are
    closed when the function exits, including when an error occurs.
    '''
    output_files = {}
    opened = []
    # Read name line MUST start with @
    fastq_tpl = '@{id}\n{seq}\n+\n{q}\n'
    istart, ilen = bc_pos
    cnt = 0
    assigned = 0
    try:
        for rec1, rec2 in izip(parse_fastq(read1), parse_fastq(read2)):
            id1, seq1, q1 = rec1
            id2, seq2, q2 = rec2
            cnt += 1
            if cnt % 1000000 == 0:
                print('Processed\t %d records...' % cnt)
                sys.stdout.flush()
            ind = seq1[istart:istart + ilen]
            # It's an exact match for now but we really need to accomodate
            # mismatches here. Hamming distance?
            if ind not in barcodes:
                continue
            assigned += 1
            qstr1 = ''.join(chr(val + 33) for val in q1)
            qstr2 = ''.join(chr(val + 33) for val in q2)
            if ind not in output_files:
                # Register each handle as soon as it is opened so the
                # cleanup below never misses one.
                r1 = gzip.open(
                    os.path.join(output_dir, '%s_R1.fastq.gz' % ind), 'wb')
                opened.append(r1)
                r2 = gzip.open(
                    os.path.join(output_dir, '%s_R2.fastq.gz' % ind), 'wb')
                opened.append(r2)
                print('...created output files for: %s' % ind)
                sys.stdout.flush()
                output_files[ind] = (r1, r2)
            out1, out2 = output_files[ind]
            out1.write(fastq_tpl.format(id=id1, seq=seq1, q=qstr1))
            out2.write(fastq_tpl.format(id=id2, seq=seq2, q=qstr2))
        print(list(output_files.keys()))
        print('Assigned:\t%d sequences' % assigned)
    finally:
        for handle in opened:
            handle.close()
示例#11
0
    def test_parse(self):
        """Parsed records should match the fixtures; a bad offset raises."""
        offset64_records = parse_fastq(self.FASTQ_EXAMPLE, phred_offset=64)
        for label, seq, qual in offset64_records:
            self.assertIn(label, DATA)
            expected = DATA[label]
            self.assertEqual(seq, expected["seq"])
            self.assertTrue((qual == expected["qual"]).all())

        # With enforce_qual_range=False, qual scores are allowed to fall
        # outside the typically acceptable range of 0-62.
        unchecked_records = parse_fastq(self.FASTQ_EXAMPLE_2,
                                        phred_offset=33,
                                        enforce_qual_range=False)
        for label, seq, qual in unchecked_records:
            self.assertIn(label, DATA_2)
            expected = DATA_2[label]
            self.assertEqual(seq, expected["seq"])
            self.assertTrue((qual == expected["qual"]).all())

        # These qual scores are meant to be read with an offset of 64;
        # reading them with 33 pushes them outside the acceptable range of
        # 0-62, which should raise a FastqParseError.
        with self.assertRaises(FastqParseError):
            list(parse_fastq(self.FASTQ_EXAMPLE, phred_offset=33))
示例#12
0
def fetch_study(study_accession, base_dir):
    """Fetch a study and dump each of its samples below base_dir

    A directory named after the study is prepared inside ``base_dir``: a
    symlink to ``ag.staged_raw_data()`` when that returns a value,
    otherwise a plain directory created if missing. Every sample reported
    by ``fetch_study_details`` that has no directory yet gets one, holding
    its sequences as ``<sample>.fna`` and the response of the ENA
    ``display=xml`` URL as ``<sample>.txt``.

    Parameters
    ----------
    study_accession : str
       Accession ID for the study
    base_dir : str
       Path of base directory to save the fetched results

    Returns
    -------
    int
       Number of samples newly fetched; 0 when ``ag.is_test_env()`` is true,
       in which case nothing is fetched.
    """
    if ag.is_test_env():
        return 0

    study_dir = os.path.join(base_dir, study_accession)

    if ag.staged_raw_data() is None:
        if not os.path.exists(study_dir):
            os.mkdir(study_dir)
    else:
        os.symlink(ag.staged_raw_data(), study_dir)

    fetched_count = 0
    for sample, fastq_url in fetch_study_details(study_accession):
        sample_dir = os.path.join(study_dir, sample)
        if os.path.exists(sample_dir):
            # an existing directory marks the sample as already fetched
            continue

        os.mkdir(sample_dir)
        metadata_path = os.path.join(sample_dir, '%s.txt' % sample)
        fasta_path = os.path.join(sample_dir, '%s.fna' % sample)

        # write out fasta
        with open(fasta_path, 'w') as fasta_out:
            fasta_out.writelines(
                ">%s\n%s\n" % (id_, seq)
                for id_, seq, _ in parse_fastq(fetch_seqs_fastq(fastq_url)))

        # write mapping xml
        xml_url = ("http://www.ebi.ac.uk/ena/data/view/"
                   "%(accession)s&display=xml") % {'accession': sample}
        res = fetch_url(xml_url)
        with open(metadata_path, 'w') as md_f:
            md_f.write(res.read())

        fetched_count += 1
    return fetched_count
示例#13
0
def fetch_study(study_accession, base_dir):
    """Fetch and dump the samples of a study

    The study directory is ``base_dir/study_accession``. It is created as
    a symlink to the staged raw data when ``ag.staged_raw_data()`` is not
    None, and as an ordinary directory (if absent) otherwise. Samples
    listed by ``fetch_study_details`` are skipped when their directory
    already exists; the remaining ones are fetched and written as a fasta
    file (``<sample>.fna``) plus a metadata file (``<sample>.txt``).

    Parameters
    ----------
    study_accession : str
       Accession ID for the study
    base_dir : str
       Path of base directory to save the fetched results

    Returns
    -------
    int
       How many samples were fetched by this call. In the test
       environment (``ag.is_test_env()``) the function returns 0
       immediately.
    """
    if ag.is_test_env():
        return 0

    study_dir = os.path.join(base_dir, study_accession)

    if ag.staged_raw_data() is not None:
        os.symlink(ag.staged_raw_data(), study_dir)
    elif not os.path.exists(study_dir):
        os.mkdir(study_dir)

    url_template = ("http://www.ebi.ac.uk/ena/data/view/"
                    "%(accession)s&display=xml")
    new_samples = 0

    for sample, fastq_url in fetch_study_details(study_accession):
        sample_dir = os.path.join(study_dir, sample)
        already_present = os.path.exists(sample_dir)
        if already_present:
            continue

        # fetch files since the sample isn't present yet
        os.mkdir(sample_dir)
        fasta_path = os.path.join(sample_dir, '%s.fna' % sample)
        metadata_path = os.path.join(sample_dir, '%s.txt' % sample)

        # sequences first ...
        with open(fasta_path, 'w') as fasta_out:
            for record in parse_fastq(fetch_seqs_fastq(fastq_url)):
                id_, seq = record[0], record[1]
                fasta_out.write(">%s\n%s\n" % (id_, seq))

        # ... then the mapping xml
        res = fetch_url(url_template % {'accession': sample})
        with open(metadata_path, 'w') as md_f:
            md_f.write(res.read())

        new_samples += 1

    return new_samples
示例#14
0
    def test_parse(self):
        """sequence and quality of each record should match the fixtures"""

        def check_records(records, expected_by_label):
            # Shared comparison of parsed records against a fixture dict
            for label, seq, qual in records:
                self.assertIn(label, expected_by_label)
                expected = expected_by_label[label]
                self.assertEqual(seq, expected["seq"])
                self.assertTrue((qual == expected["qual"]).all())

        check_records(
            parse_fastq(self.FASTQ_EXAMPLE, phred_offset=64), DATA)

        # Make sure that enforce_qual_range set to False allows qual scores
        # to fall outside the typically acceptable range of 0-62
        check_records(
            parse_fastq(self.FASTQ_EXAMPLE_2,
                        phred_offset=33,
                        enforce_qual_range=False),
            DATA_2)

        # The qual scores here are intended for an offset of 64; with an
        # offset of 33 they fall outside the acceptable range of 0-62, so
        # a FastqParseError is expected.
        with self.assertRaises(FastqParseError):
            list(parse_fastq(self.FASTQ_EXAMPLE, phred_offset=33))
示例#15
0
文件: parse.py 项目: Gaby1212/qiime
def parse_fastq_qual_score(fastq_lines):
    """Map every fastq header to an array built from its quality values.

    The first line of ``fastq_lines`` is read to choose a converter
    (``ascii_to_phred33`` when ``is_casava_v180_or_later`` accepts it,
    ``ascii_to_phred64`` otherwise), after which the handle is rewound to
    its start. ``fastq_lines`` must therefore support ``readline`` and
    ``seek``.
    """
    header_probe = fastq_lines.readline()
    fastq_lines.seek(0)

    ascii_to_phred_f = (ascii_to_phred33
                        if is_casava_v180_or_later(header_probe)
                        else ascii_to_phred64)

    # NOTE(review): the chosen converter is handed to asarray as ``dtype``,
    # exactly as before -- confirm that this is the intended use.
    return dict(
        (header, asarray(qual, dtype=ascii_to_phred_f))
        for header, _, qual in parse_fastq(fastq_lines))
示例#16
0
def parse_fastq_qual_score(fastq_lines):
    """Return a dict of fastq header -> array of that record's quality.

    The converter is selected from the first line of the input:
    ``ascii_to_phred33`` if ``is_casava_v180_or_later`` returns true for
    it, else ``ascii_to_phred64``. The input is rewound with ``seek(0)``
    before the records are parsed.
    """
    first_line = fastq_lines.readline()
    fastq_lines.seek(0)

    ascii_to_phred_f = ascii_to_phred64
    if is_casava_v180_or_later(first_line):
        ascii_to_phred_f = ascii_to_phred33

    qual_by_header = {}
    for record in parse_fastq(fastq_lines):
        header, qual = record[0], record[2]
        # NOTE(review): the converter is passed as ``dtype`` (unchanged
        # from the original) -- verify this is what asarray should get.
        qual_by_header[header] = asarray(qual, dtype=ascii_to_phred_f)
    return qual_by_header
def remove_primers(input_fastq, output_fastq, for_primers, rev_primers,
                   ed_tol):
    """Trim reads to the region delimited by a matching primer pair.

    Forward and reverse primers are paired positionally
    (``zip(for_primers, rev_primers)``). For each read the pairs are tried
    in order; the first pair for which both ``editSearchForward`` and
    ``editSearchReverse`` return something other than -1 defines the slice
    ``[start:end]`` that is applied to both sequence and quality, and the
    trimmed record is written to ``output_fastq``. Reads for which no pair
    matches are dropped.

    Previously the match test ran only after the primer loop had finished,
    so every pair except the last was ignored, and empty primer lists led
    to a NameError (or to reuse of the previous read's positions).

    input_fastq: path of the fastq file to read
    output_fastq: path of the fastq file to write
    for_primers, rev_primers: parallel sequences of primers
    ed_tol: tolerance forwarded to the edit-distance searches
    """
    with open(input_fastq) as read, open(output_fastq, "w") as out_seqs:
        for label, seq, qual in parse_fastq(read):
            for primerF, primerR in zip(for_primers, rev_primers):
                start_slice = editSearchForward(primerF, seq, ed_tol)
                if start_slice == -1:
                    continue
                end_slice = editSearchReverse(primerR, seq, ed_tol)
                if end_slice == -1:
                    continue

                out_seqs.write(format_fastq_record(
                    label,
                    seq[start_slice:end_slice],
                    qual[start_slice:end_slice]))
                # One output record per read: stop at the first matching
                # pair.
                break
示例#18
0
文件: utils.py 项目: danman0091/keds
def split_pools(barcode, dirname='../data'):
    '''
    Splits the reads in the R2 file(s) of the sample specified by `barcode`
    into `plus` and `minus` pools.

    Files directly inside `dirname` whose name starts with `barcode` are
    considered; those containing 'R1' are paired with those containing
    'R2'. For every read pair the start of the R1 sequence decides the
    pool: a match of `[CT][CT][CT][AG]` sends the R2 record to
    `<barcode>_minus.fastq.gz`, a match of `[AG][AG][AG][CT]` sends it to
    `<barcode>_plus.fastq.gz` (both in `dirname`); other pairs are dropped.
    Quality values are written as characters with an offset of 33.

    The two output files are opened once for the whole run, so records of
    all input pairs accumulate in them (previously each input pair
    truncated the outputs written for the pair before it). Nothing is
    created when no input pair is found.
    '''
    _, _, filenames = next(os.walk(dirname))
    files = [f for f in filenames if f.startswith(barcode)]
    # Sorted so that the i-th R1 file lines up with the i-th R2 file no
    # matter in which order the directory happens to be listed.
    files_R1 = sorted(os.path.join(dirname, f) for f in files if 'R1' in f)
    files_R2 = sorted(os.path.join(dirname, f) for f in files if 'R2' in f)
    file_pairs = list(zip(files_R1, files_R2))
    if not file_pairs:
        return

    fastq_tpl = '@{id}\n{seq}\n+\n{qual}\n'
    minus_re = re.compile('^[CT][CT][CT][AG]')
    plus_re = re.compile('^[AG][AG][AG][CT]')
    minus_path = os.path.join(dirname, barcode + '_minus.fastq.gz')
    plus_path = os.path.join(dirname, barcode + '_plus.fastq.gz')

    with gzip.open(minus_path, 'wb') as gz_minus, \
            gzip.open(plus_path, 'wb') as gz_plus:
        for file_R1, file_R2 in file_pairs:
            print("Processing files:\t{f1}\t{f2}".format(f1=file_R1,
                                                         f2=file_R2))
            sys.stdout.flush()
            cnt = 0
            cnt_plus = 0
            cnt_minus = 0
            with gzip.open(file_R1, 'rb') as gzr1, \
                    gzip.open(file_R2, 'rb') as gzr2:
                for rec1, rec2 in izip(parse_fastq(gzr1),
                                       parse_fastq(gzr2)):
                    cnt += 1
                    if cnt % 1000000 == 0:
                        print("\t\t{0}\trecords...".format(cnt))
                        sys.stdout.flush()
                    # Only the R1 sequence is inspected; the R2 record is
                    # what gets written.
                    seq1 = rec1[1]
                    id2, seq2, qual2 = rec2
                    qual_str = ''.join(chr(33 + q) for q in qual2)
                    if minus_re.match(seq1):
                        gz_minus.write(fastq_tpl.format(
                            id=id2, seq=seq2, qual=qual_str))
                        cnt_minus += 1
                    elif plus_re.match(seq1):
                        gz_plus.write(fastq_tpl.format(
                            id=id2, seq=seq2, qual=qual_str))
                        cnt_plus += 1
            print("{0}\tplus records\t{1}\tminus records".format(cnt_plus,
                                                                 cnt_minus))
            sys.stdout.flush()
def remove_primers(input_fastq, output_fastq, primers):
    """Strip everything up to the end of the first primer match.

    ``primers`` is an object with a ``search`` method (e.g. a compiled
    regular expression). For each read of ``input_fastq`` the pattern is
    searched once; when it matches and the match ends past position 0,
    sequence and quality are cut at the end of the match and the record is
    written to ``output_fastq``. Reads without such a match are dropped.

    input_fastq: path of the fastq file to read
    output_fastq: path of the fastq file to write
    primers: pattern object used to locate the primer
    """
    with open(input_fastq) as read, open(output_fastq, "w") as out_seqs:
        for label, seq, qual in parse_fastq(read):
            # Search once and reuse the match instead of scanning the
            # sequence twice.
            match = primers.search(seq)
            if not match:
                continue
            start_slice = int(match.end())
            if start_slice > 0:
                out_seqs.write(format_fastq_record(
                    label, seq[start_slice:], qual[start_slice:]))
示例#20
0
def fetch_study(accession, base_dir):
    """Fetch and dump a full study

    Writes two files into ``base_dir``: ``<accession>.fna`` with the
    sequences of every sample returned by ``fetch_study_details`` and
    ``<accession>.txt``, a tab-separated metadata table with one row per
    sample. Nothing is done when both files already exist.

    Fetching sequences is best effort: if it fails for a sample, that
    sample's remaining sequences are skipped while its metadata row is
    still written. Both files are closed even when an error propagates.
    """
    metadata_path = os.path.join(base_dir, '%s.txt' % accession)
    fasta_path = os.path.join(base_dir, '%s.fna' % accession)

    if os.path.exists(fasta_path) and os.path.exists(metadata_path):
        # it appears we already have the accession, so short circuit
        return

    all_md = {}
    all_cols = set(['BarcodeSequence', 'LinkerPrimerSequence'])
    with open(metadata_path, 'w') as md_f, open(fasta_path, 'w') as fasta_f:
        for sample, fastq_url in fetch_study_details(accession):
            # in the form seqs_000007123.1075697.fastq.gz
            # and unfortunately, the suffix (1075697) is missing and parts
            # of the current results processing depend on the suffix.
            fastq_filename = fastq_url.rsplit('/')[-1]
            qiimedb_samplename = \
                fastq_filename.split('_')[-1].rsplit('.', 2)[0]

            md = fetch_metadata_xml(sample)
            all_md[qiimedb_samplename] = md
            all_cols.update(md)

            # write out fasta
            try:
                for id_, seq, qual in parse_fastq(
                        fetch_seqs_fastq(fastq_url)):
                    fasta_f.write(">%s\n%s\n" % (id_, seq))
            except Exception:
                # Deliberately best effort, but unlike a bare ``except``
                # this lets KeyboardInterrupt / SystemExit through.
                continue

        header = list(all_cols)
        md_f.write('#SampleID\t')
        md_f.write('\t'.join(header))
        md_f.write('\n')
        for sampleid, values in all_md.items():
            to_write = [values.get(k, "no_data").encode('utf-8')
                        for k in header]
            to_write.insert(0, sampleid)
            md_f.write('\t'.join(to_write))
            md_f.write('\n')
示例#21
0
def fetch_study(accession, metadata_path, fasta_path):
    """Fetch and dump a full study

    The sequences of every sample returned by ``fetch_study_details`` are
    written to ``fasta_path``; a tab-separated metadata table with one row
    per sample is written to ``metadata_path``.

    Fetching sequences is best effort: if it fails for a sample, that
    sample's remaining sequences are skipped while its metadata row is
    still written. Both files are closed even when an error propagates.
    """
    all_md = {}
    all_cols = set(['BarcodeSequence', 'LinkerPrimerSequence'])
    with open(metadata_path, 'w') as md_f, open(fasta_path, 'w') as fasta_f:
        for sample, fastq_url in fetch_study_details(accession):
            # in the form seqs_000007123.1075697.fastq.gz
            # and unfortunately, the suffix (1075697) is missing and parts
            # of the current results processing depend on the suffix.
            fastq_filename = fastq_url.rsplit('/')[-1]
            qiimedb_samplename = \
                fastq_filename.split('_')[-1].rsplit('.', 2)[0]

            md = fetch_metadata_xml(sample)
            all_md[qiimedb_samplename] = md
            all_cols.update(md)

            # write out fasta
            try:
                for id_, seq, qual in parse_fastq(
                        fetch_seqs_fastq(fastq_url)):
                    fasta_f.write(">%s\n%s\n" % (id_, seq))
            except Exception:
                # Still best effort, but KeyboardInterrupt / SystemExit
                # are no longer swallowed as they were by a bare except.
                continue

        header = list(all_cols)
        md_f.write('#SampleID\t')
        md_f.write('\t'.join(header))
        md_f.write('\n')
        for sampleid, values in all_md.items():
            to_write = [values.get(k, "no_data") for k in header]
            to_write.insert(0, sampleid)
            md_f.write('\t'.join(to_write))
            md_f.write('\n')
示例#22
0
def fetch_study(accession, metadata_path, fasta_path):
    """Fetch and dump a full study

    The sequences of every sample returned by ``fetch_study_details`` are
    written to ``fasta_path``; a tab-separated metadata table with one row
    per sample is written to ``metadata_path``. Metadata columns are the
    union of the keys seen across all samples, with "no_data" filled in
    where a sample lacks a column.

    Any error raised while fetching or parsing propagates to the caller;
    both output files are closed in that case as well.
    """
    all_md = {}
    all_cols = set([])
    # The handles get their own names instead of rebinding ``fasta_path``,
    # and the with-block closes them on every exit path.
    with open(metadata_path, 'w') as md_f, open(fasta_path, 'w') as fasta_f:
        for sample, fastq_url in fetch_study_details(accession):
            # in the form seqs_000007123.1075697.fastq.gz
            # and unfortunately, the suffix (1075697) is missing and parts
            # of the current results processing depend on the suffix.
            fastq_filename = fastq_url.rsplit('/')[-1]
            qiimedb_samplename = \
                fastq_filename.split('_')[-1].rsplit('.', 2)[0]

            md = fetch_metadata_xml(sample)
            all_md[qiimedb_samplename] = md
            all_cols.update(md)

            # write out fasta
            for id_, seq, qual in parse_fastq(fetch_seqs_fastq(fastq_url)):
                fasta_f.write(">%s\n%s\n" % (id_, seq))

        header = list(all_cols)
        md_f.write('#SampleID\t')
        md_f.write('\t'.join(header))
        md_f.write('\n')
        for sampleid, values in all_md.items():
            to_write = [values.get(k, "no_data") for k in header]
            to_write.insert(0, sampleid)
            md_f.write('\t'.join(to_write))
            md_f.write('\n')
def read_fwd_rev_read(fwd_read_f,
                      rev_read_f,
                      bc_to_sid,
                      barcode_len,
                      barcode_correction_fn,
                      bc_to_fwd_primers,
                      bc_to_rev_primers,
                      max_barcode_errors,
                      fwd_length,
                      rev_length):
    """
    Reads fwd and rev read fastq files in lockstep and bins the read pairs
    by sample ID and random barcode.

    For each pair the sample barcode is taken from the end of the forward
    read, corrected and mapped to a sample ID; the random barcode and the
    primers are then stripped and both reads are truncated. Pairs that fail
    a step are counted and skipped.

    Parameters
    ----------
    fwd_read_f: file
        forward read fastq file
    rev_read_f: file
        reverse read fastq file
    bc_to_sid: dict
        passed to correct_barcode to map barcodes to sample IDs
    barcode_len: int
        barcode length
    barcode_correction_fn: function
        applicable only for golay_12 barcodes
    bc_to_fwd_primers: dict
        barcode -> dict whose keys are the possible forward primers
    bc_to_rev_primers: dict
        barcode -> possible reverse primers
    max_barcode_errors: int
        maximum allowable errors in barcodes, applicable for golay_12
    fwd_length: int
        standard length, used for truncating of the forward sequence;
        shorter forward sequences are discarded
    rev_length: int
        standard length, used for truncating of the reverse sequence

    Returns
    ----------
    random_bc_lookup: defaultdict
        contains sample ID -> random barcode -> (fwd seq, rev seq) -> count
    random_bc_reads: defaultdict
        contains sample ID -> random barcode -> number of reads
    random_bcs: dict
        contains sample ID -> list of random barcodes extracted
    barcode_errors_exceed_max_count: int
    barcode_not_in_map_count: int
    primer_mismatch_count: int
    seq_too_short_count: int
    input_seqs_count: int
    total_seqs_kept: int

    Raises
    ----------
    PairedEndParseError
        if the headers of a forward / reverse read pair differ
    """
    random_bc_lookup = defaultdict(lambda:
                                   defaultdict(lambda:
                                               defaultdict(int)))

    random_bc_reads = defaultdict(lambda:
                                  defaultdict(int))

    random_bcs = {}

    # Counts for Quality Control:
    input_seqs_count = 0
    total_seqs_kept = 0
    barcode_errors_exceed_max_count = 0
    barcode_not_in_map_count = 0
    primer_mismatch_count = 0
    seq_too_short_count = 0

    header_idx = 0
    seq_idx = 1

    fwd_reads = parse_fastq(fwd_read_f, strict=False,
                            enforce_qual_range=False)
    rev_reads = parse_fastq(rev_read_f, strict=False,
                            enforce_qual_range=False)

    for fwd_read, rev_read in izip(fwd_reads, rev_reads):

        input_seqs_count += 1

        # confirm match between headers
        if fwd_read[header_idx] != rev_read[header_idx]:
            raise PairedEndParseError(
                "Headers of forward and reverse reads "
                "do not match. Confirm that the forward "
                "and reverse read fastq files that you "
                "provided have headers that match one "
                "another.")

        fwd_seq = fwd_read[seq_idx]
        rev_seq = rev_read[seq_idx]

        #  Grab the barcode sequence. It is always at the very end of the
        #  forward read. Strip the barcode from the sequence.
        barcode = fwd_seq[-barcode_len:]
        fwd_seq = fwd_seq[:-barcode_len]

        #  Correct the barcode(if applicable) and map to sample ID.
        num_barcode_errors, corrected_barcode, _, sample_id =\
            correct_barcode(barcode, bc_to_sid, barcode_correction_fn)

        #  Skip barcodes with too many errors.
        if num_barcode_errors > max_barcode_errors:
            barcode_errors_exceed_max_count += 1
            continue

        if sample_id is None:
            barcode_not_in_map_count += 1
            continue

        # Extract the random barcode and primer from the forward read.
        possible_primers = bc_to_fwd_primers[corrected_barcode].keys()

        try:
            random_bc, _, clean_fwd_seq = extract_primer(fwd_seq,
                                                         possible_primers,
                                                         min_idx=5,
                                                         max_idx=20)
        except PrimerMismatchError:
            primer_mismatch_count += 1
            continue

        # Done outside the try block so that a KeyError raised inside
        # extract_primer is not mistaken for "first barcode of the sample".
        random_bcs.setdefault(sample_id, []).append(random_bc)

        # Use the corrected barcode here as well, consistent with the
        # forward-primer lookup above; the raw barcode is not a valid key
        # once a correction has been applied.
        possible_primers = bc_to_rev_primers[corrected_barcode]

        try:
            _, _, clean_rev_seq = extract_primer(rev_seq,
                                                 possible_primers)
        except PrimerMismatchError:
            primer_mismatch_count += 1
            continue

        if len(clean_fwd_seq) < fwd_length:
            seq_too_short_count += 1
            continue

        clean_fwd_seq = clean_fwd_seq[:fwd_length]
        clean_rev_seq = clean_rev_seq[:rev_length]

        total_seqs_kept += 1
        random_bc_reads[sample_id][random_bc] += 1
        random_bc_lookup[sample_id][random_bc][
            (clean_fwd_seq, clean_rev_seq)] += 1

    return (random_bc_lookup,
            random_bc_reads,
            random_bcs,
            barcode_errors_exceed_max_count,
            barcode_not_in_map_count,
            primer_mismatch_count,
            seq_too_short_count,
            input_seqs_count,
            total_seqs_kept)
示例#24
0
def extract_barcodes(fastq1,
                     fastq2=None,
                     output_dir=".",
                     input_type="barcode_single_end",
                     bc1_len=6,
                     bc2_len=6,
                     rev_comp_bc1=False,
                     rev_comp_bc2=False,
                     char_delineator=":",
                     switch_bc_order=False,
                     map_fp=None,
                     attempt_read_orientation=False,
                     disable_header_match=False):
    """ Main program function for extracting barcodes from reads

    fastq1: Open fastq file 1.
    fastq2: None or open fastq file 2.
    output_dir: Directory to write output parses sequences to.
    input_type: Specifies the type of parsing to be done. The values handled
        here are "barcode_single_end", "barcode_paired_end",
        "barcode_paired_stitched" and "barcode_in_label".
    bc1_len: Length of barcode 1 to be parsed from fastq1
    bc2_len: Length of barcode 2 to be parsed from fastq2, or from end of a
     stitched read.
    rev_comp_bc1: If True, reverse complement bc1 before writing.
    rev_comp_bc2: If True, reverse complement bc2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the parameter chosen for the fastq files.
    map_fp: open file object of mapping file, requires a LinkerPrimerSequence
        and ReversePrimer field to be present. Used for orienting reads.
    attempt_read_orientation: If True, will attempt to orient the reads
        according to the forward primers in the mapping file. If primer is
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads. Ignored (forced
        to False) when input_type is "barcode_single_end".
    disable_header_match: if True, suppresses checks between fastq headers.

    Every output file is first written under output_dir with an ".incomplete"
    suffix and is renamed to its final name only after all reads have been
    processed and the file has been closed.

    Raises FastqParseError when header checking is enabled and the headers of
    a read1/read2 pair do not match.
    """

    # Turn off extra file creation for single read.
    if input_type == "barcode_single_end" and attempt_read_orientation:
        attempt_read_orientation = False
    if attempt_read_orientation:
        header, mapping_data, run_description, errors, warnings =\
            process_id_map(map_fp)
        forward_primers, reverse_primers = get_primers(header, mapping_data)
        output_bc_not_oriented = open(
            join(output_dir, "barcodes_not_oriented.fastq.incomplete"), "w")
        fastq1_out_not_oriented = open(
            join(output_dir, "reads1_not_oriented.fastq.incomplete"), "w")
        fastq2_out_not_oriented = open(
            join(output_dir, "reads2_not_oriented.fastq.incomplete"), "w")
    else:
        forward_primers = None
        reverse_primers = None
        output_bc_not_oriented = None
        fastq1_out_not_oriented = None
        fastq2_out_not_oriented = None

    output_bc_fastq = open(join(output_dir, "barcodes.fastq.incomplete"), "w")
    if input_type in ["barcode_single_end", "barcode_paired_stitched"]:
        output_fastq1 = open(join(output_dir, "reads.fastq.incomplete"), "w")
        output_fastq2 = None
        final_fastq1_name = join(output_dir, "reads.fastq")
    elif input_type in ["barcode_paired_end"]:
        output_fastq1 = open(join(output_dir, "reads1.fastq.incomplete"), "w")
        output_fastq2 = open(join(output_dir, "reads2.fastq.incomplete"), "w")
        final_fastq1_name = join(output_dir, "reads1.fastq")
    else:
        # No read files are produced for this input type; bind the final name
        # anyway so the finalisation table below can always be built.
        output_fastq1 = None
        output_fastq2 = None
        final_fastq1_name = None

    if not fastq2:
        # Endless placeholder record so that izip below is driven solely by
        # the number of records in fastq1.
        fastq2 = cycle(["@", "AAAAAAAAAAAA", "+", "AAAAAAAAAAAA"])
        not_paired = True
    else:
        not_paired = False

    check_header_match_f = get_casava_version(fastq1)

    header_index = 0

    for read1_data, read2_data in izip(
            parse_fastq(fastq1, strict=False, enforce_qual_range=False),
            parse_fastq(fastq2, strict=False, enforce_qual_range=False)):
        # NOTE(review): when fastq2 was not supplied, read2_data comes from
        # the placeholder record above, so this check compares against its
        # header -- confirm callers pass disable_header_match in that case.
        if not disable_header_match:
            if not check_header_match_f(read1_data[header_index],
                                        read2_data[header_index]):
                raise FastqParseError(
                    "Headers of read1 and read2 do not match. Can't continue. "
                    "Confirm that the fastq sequences that you are "
                    "passing match one another. --disable_header_match can be "
                    "used to suppress header checks.")

        if input_type == "barcode_single_end":
            process_barcode_single_end_data(read1_data, output_bc_fastq,
                                            output_fastq1, bc1_len,
                                            rev_comp_bc1)

        elif input_type == "barcode_paired_end":
            process_barcode_paired_end_data(
                read1_data, read2_data, output_bc_fastq, output_fastq1,
                output_fastq2, bc1_len, bc2_len, rev_comp_bc1, rev_comp_bc2,
                attempt_read_orientation, forward_primers, reverse_primers,
                output_bc_not_oriented, fastq1_out_not_oriented,
                fastq2_out_not_oriented)

        elif input_type == "barcode_paired_stitched":
            process_barcode_paired_stitched(
                read1_data, output_bc_fastq, output_fastq1, bc1_len, bc2_len,
                rev_comp_bc1, rev_comp_bc2, attempt_read_orientation,
                forward_primers, reverse_primers, output_bc_not_oriented,
                fastq1_out_not_oriented, switch_bc_order)

        elif input_type == "barcode_in_label":
            if not_paired:
                curr_read2_data = False
            else:
                curr_read2_data = read2_data
            process_barcode_in_label(read1_data, curr_read2_data,
                                     output_bc_fastq, bc1_len, bc2_len,
                                     rev_comp_bc1, rev_comp_bc2,
                                     char_delineator)

    # Every handle is closed *before* it is renamed so buffered data is
    # flushed to disk first; the not-oriented files were previously renamed
    # while still open.
    finished_outputs = [
        (output_bc_fastq, join(output_dir, "barcodes.fastq")),
        (output_fastq1, final_fastq1_name),
        (output_fastq2, join(output_dir, "reads2.fastq")),
        (output_bc_not_oriented,
         join(output_dir, "barcodes_not_oriented.fastq")),
        (fastq1_out_not_oriented,
         join(output_dir, "reads1_not_oriented.fastq")),
        (fastq2_out_not_oriented,
         join(output_dir, "reads2_not_oriented.fastq")),
    ]
    for out_handle, final_name in finished_outputs:
        if out_handle is not None:
            out_handle.close()
            rename(out_handle.name, final_name)
示例#25
0
def extract_barcodes(fastq1,
                     fastq2=None,
                     output_dir=".",
                     input_type="barcode_single_end",
                     bc1_len=6,
                     bc2_len=6,
                     rev_comp_bc1=False,
                     rev_comp_bc2=False,
                     char_delineator=":",
                     switch_bc_order=False,
                     map_fp=None,
                     attempt_read_orientation=False,
                     disable_header_match=False):
    """ Main program function for extracting barcodes from reads

    fastq1: Open fastq file 1.
    fastq2: None or open fastq file 2.
    output_dir: Directory to write output parses sequences to.
    input_type: Specifies the type of parsing to be done.
    bc1_len: Length of barcode 1 to be parsed from fastq1
    bc2_len: Length of barcode 2 to be parsed from fastq2, or from end of a
     stitched read.
    rev_comp_bc1: If True, reverse complement bc1 before writing.
    rev_comp_bc2: If True, reverse complement bc2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the parameter chosen for the fastq files.
    map_fp: open file object of mapping file, requires a LinkerPrimerSequence
        and ReversePrimer field to be present. Used for orienting reads.
    attempt_read_orientation: If True, will attempt to orient the reads
        according to the forward primers in the mapping file. If primer is
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    disable_header_match: if True, suppresses checks between fastq headers.

    Outputs are written as "<name>.incomplete" inside output_dir and renamed
    to "<name>" after the input has been fully processed. Each file is closed
    before it is renamed.

    Raises FastqParseError if header checks are enabled and a pair of
    read1/read2 headers does not match.
    """

    # Turn off extra file creation for single read.
    if input_type == "barcode_single_end" and attempt_read_orientation:
        attempt_read_orientation = False
    if attempt_read_orientation:
        header, mapping_data, run_description, errors, warnings =\
            process_id_map(map_fp)
        forward_primers, reverse_primers = get_primers(header, mapping_data)
        output_bc_not_oriented = open(
            join(output_dir, "barcodes_not_oriented.fastq.incomplete"), "w")
        fastq1_out_not_oriented = open(
            join(output_dir, "reads1_not_oriented.fastq.incomplete"), "w")
        fastq2_out_not_oriented = open(
            join(output_dir, "reads2_not_oriented.fastq.incomplete"), "w")
    else:
        forward_primers = None
        reverse_primers = None
        output_bc_not_oriented = None
        fastq1_out_not_oriented = None
        fastq2_out_not_oriented = None

    output_bc_fastq = open(join(output_dir, "barcodes.fastq.incomplete"), "w")
    if input_type in ["barcode_single_end", "barcode_paired_stitched"]:
        output_fastq1 = open(join(output_dir, "reads.fastq.incomplete"), "w")
        output_fastq2 = None
        final_fastq1_name = join(output_dir, "reads.fastq")
    elif input_type in ["barcode_paired_end"]:
        output_fastq1 = open(join(output_dir, "reads1.fastq.incomplete"), "w")
        output_fastq2 = open(join(output_dir, "reads2.fastq.incomplete"), "w")
        final_fastq1_name = join(output_dir, "reads1.fastq")
    else:
        # Only the barcodes file is produced for any other input type.
        output_fastq1 = None
        output_fastq2 = None

    if not fastq2:
        # Repeating dummy record: lets the paired iteration below run for as
        # many records as fastq1 contains.
        fastq2 = cycle(["@", "AAAAAAAAAAAA", "+", "AAAAAAAAAAAA"])
        not_paired = True
    else:
        not_paired = False

    check_header_match_f = get_casava_version(fastq1)

    header_index = 0

    for read1_data, read2_data in izip(
            parse_fastq(fastq1, strict=False, enforce_qual_range=False),
            parse_fastq(fastq2, strict=False, enforce_qual_range=False)):
        if not disable_header_match:
            if not check_header_match_f(read1_data[header_index],
                                        read2_data[header_index]):
                raise FastqParseError(
                    "Headers of read1 and read2 do not match. Can't continue. "
                    "Confirm that the fastq sequences that you are "
                    "passing match one another. --disable_header_match can be "
                    "used to suppress header checks.")

        if input_type == "barcode_single_end":
            process_barcode_single_end_data(read1_data, output_bc_fastq,
                                            output_fastq1, bc1_len,
                                            rev_comp_bc1)

        elif input_type == "barcode_paired_end":
            process_barcode_paired_end_data(
                read1_data, read2_data, output_bc_fastq, output_fastq1,
                output_fastq2, bc1_len, bc2_len, rev_comp_bc1, rev_comp_bc2,
                attempt_read_orientation, forward_primers, reverse_primers,
                output_bc_not_oriented, fastq1_out_not_oriented,
                fastq2_out_not_oriented)

        elif input_type == "barcode_paired_stitched":
            process_barcode_paired_stitched(
                read1_data, output_bc_fastq, output_fastq1, bc1_len, bc2_len,
                rev_comp_bc1, rev_comp_bc2, attempt_read_orientation,
                forward_primers, reverse_primers, output_bc_not_oriented,
                fastq1_out_not_oriented, switch_bc_order)

        elif input_type == "barcode_in_label":
            if not_paired:
                curr_read2_data = False
            else:
                curr_read2_data = read2_data
            process_barcode_in_label(read1_data, curr_read2_data,
                                     output_bc_fastq, bc1_len, bc2_len,
                                     rev_comp_bc1, rev_comp_bc2,
                                     char_delineator)

    # Close each output before renaming it so that buffered data reaches the
    # disk and no descriptor is left open behind the final file name.
    output_bc_fastq.close()
    rename(output_bc_fastq.name, join(output_dir, "barcodes.fastq"))
    if output_fastq1:
        output_fastq1.close()
        rename(output_fastq1.name, final_fastq1_name)
    if output_fastq2:
        output_fastq2.close()
        rename(output_fastq2.name, join(output_dir, "reads2.fastq"))
    if output_bc_not_oriented:
        output_bc_not_oriented.close()
        rename(output_bc_not_oriented.name,
               join(output_dir, "barcodes_not_oriented.fastq"))
    if fastq1_out_not_oriented:
        fastq1_out_not_oriented.close()
        rename(fastq1_out_not_oriented.name,
               join(output_dir, "reads1_not_oriented.fastq"))
    if fastq2_out_not_oriented:
        fastq2_out_not_oriented.close()
        rename(fastq2_out_not_oriented.name,
               join(output_dir, "reads2_not_oriented.fastq"))
示例#26
0
    def test_parse_error(self):
        """parse_fastq must raise FastqParseError for each failing fixture."""
        # Each entry pairs a fixture with the keyword arguments under which
        # consuming the parser is expected to fail.
        failing_cases = (
            (self.FASTQ_EXAMPLE_2, {'strict': True}),
            (self.FASTQ_EXAMPLE_3, {'phred_offset': 64}),
        )
        for fastq_lines, parser_kwargs in failing_cases:
            with self.assertRaises(FastqParseError):
                list(parse_fastq(fastq_lines, **parser_kwargs))
示例#27
0
 def test_invalid_phred_offset(self):
     """Consuming parse_fastq with phred_offset=42 must raise ValueError."""
     bad_offset = 42
     with self.assertRaises(ValueError):
         records = parse_fastq(self.FASTQ_EXAMPLE, phred_offset=bad_offset)
         list(records)
示例#28
0
def filter_fastq_fp(input_seqs_fp, output_seqs_fp, seqs_to_keep, negate=False):
    """Filter the fastq file at input_seqs_fp into output_seqs_fp.

    Opens both paths, parses the input non-strictly with parse_fastq and
    delegates the filtering to filter_fastq, returning whatever it returns.

    input_seqs_fp: path of the fastq file to read.
    output_seqs_fp: path of the file to write (opened in 'w' mode).
    seqs_to_keep: forwarded unchanged to filter_fastq.
    negate: forwarded unchanged to filter_fastq.
    """
    # NOTE(review): neither handle is closed in this function; presumably
    # filter_fastq takes care of the output handle -- confirm.
    fastq_in = open(input_seqs_fp, 'U')
    records = parse_fastq(fastq_in, strict=False)
    fastq_out = open(output_seqs_fp, 'w')
    return filter_fastq(records, fastq_out, seqs_to_keep, negate)
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_offset=None):
    """Generator over the reads of a single-end fastq read/barcode file pair.

    Barcode and read records are consumed in lockstep. Each read that is
    assigned to a sample and passes quality filtering is yielded as
    (fasta_header, sequence, quality, seq_id).

    fastq_read_f: read data; either an object with readline()/seek() or an
        indexable sequence of lines.
    fastq_barcode_f: barcode data, handed to parse_fastq.
    barcode_to_sample_id: dict mapping barcode to sample id.
    store_unassigned: if True, reads whose sample id is None are labelled
        'Unassigned' instead of being skipped.
    max_bad_run_length, phred_quality_threshold, seq_max_N,
    filter_bad_illumina_qual_digit: forwarded to quality_filter_sequence.
    min_per_read_length_fraction: multiplied by the length of the second
        line of fastq_read_f to obtain the minimum read length forwarded to
        quality_filter_sequence.
    rev_comp: if True, reverse complement the sequence and reverse the
        quality before yielding.
    rev_comp_barcode: if True, reverse complement the barcode before it is
        corrected.
    start_seq_id: id given to the first yielded read; incremented per yield.
    log_f: if not None, a summary string is written to it once all records
        have been consumed.
    histogram_f: if not None and at least one read was kept, a length
        histogram is written to it once all records have been consumed.
    barcode_correction_fn: forwarded to correct_barcode.
    max_barcode_errors: reads whose barcode has more errors are skipped.
    strict_header_match: if True, mismatching barcode/read headers raise
        FastqParseError.
    phred_offset: 33 or 64; when None it is chosen from the first line of
        fastq_read_f via is_casava_v180_or_later.

    Raises ValueError for a phred_offset other than 33 or 64 and for an
    unrecognised quality filter result. As this is a generator, nothing runs
    (and nothing is raised) until iteration starts.
    """
    header_index, sequence_index, quality_index = 0, 1, 2

    seq_id = start_seq_id
    # Peek at the first two lines; file-like inputs are rewound afterwards,
    # anything without readline()/seek() is indexed instead.
    try:
        first_line = fastq_read_f.readline()
        second_line = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        first_line = fastq_read_f[0]
        second_line = fastq_read_f[1]

    if phred_offset is None:
        phred_offset = 33 if is_casava_v180_or_later(first_line) else 64

    if phred_offset not in (33, 64):
        raise ValueError("Invalid PHRED offset: %d" % phred_offset)
    check_header_match_f = (check_header_match_180_or_later
                            if phred_offset == 33
                            else check_header_match_pre180)

    # A single shared barcode length allows trimming over-long barcode reads
    # (e.g., extra cycles added for technical reasons on the sequencer).
    distinct_bc_lengths = set(
        len(bc) for bc, _ in barcode_to_sample_id.items())
    barcode_length = (distinct_bc_lengths.pop()
                      if len(distinct_bc_lengths) == 1 else None)

    # Minimum read length, as a fraction of the length of the second input
    # line.
    # NOTE(review): second_line is used as read, so it may still carry a
    # trailing newline when it came from readline() -- confirm.
    min_per_read_length = min_per_read_length_fraction * len(second_line)

    # Bookkeeping for the log / histogram written at the end.
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_barcode_errors_exceed_max = 0
    # Rejections keyed by the non-zero quality_filter_sequence result:
    # 1 -> too short, 2 -> too many N, 3 -> bad illumina quality digit.
    rejected_by_filter = {1: 0, 2: 0, 3: 0}
    sequence_lengths = []
    seqs_per_sample_counts = {}

    paired_records = izip(
        parse_fastq(fastq_barcode_f, strict=False, phred_offset=phred_offset),
        parse_fastq(fastq_read_f, strict=False, phred_offset=phred_offset))
    for bc_data, read_data in paired_records:
        input_sequence_count += 1

        # Barcode and read headers must agree when strict matching is on.
        if strict_header_match and not check_header_match_f(
                bc_data[header_index], read_data[header_index]):
            raise FastqParseError(
                "Headers of barcode and read do not match. Can't continue. "
                "Confirm that the barcode fastq and read fastq that you are "
                "passing match one another.")
        header = read_data[header_index]

        # Trim the barcode read to the expected length first; the reverse
        # complement (if requested) is applied afterwards because any extra
        # trailing base is a technical artefact, not barcode sequence.
        barcode = bc_data[sequence_index]
        if barcode_length:
            barcode = barcode[:barcode_length]
        if rev_comp_barcode:
            barcode = str(DNA(barcode).rc())

        sequence = read_data[sequence_index]
        quality = read_data[quality_index]

        # Correct the barcode (if applicable) and map it to a sample id.
        (num_barcode_errors, corrected_barcode,
         correction_attempted, sample_id) = correct_barcode(
            barcode, barcode_to_sample_id, barcode_correction_fn)

        # Skip reads whose barcode has too many errors.
        if num_barcode_errors > max_barcode_errors:
            count_barcode_errors_exceed_max += 1
            continue

        # Skip unassignable reads unless asked to keep them.
        if sample_id is None:
            if not store_unassigned:
                count_barcode_not_in_map += 1
                continue
            sample_id = 'Unassigned'

        quality_filter_result, sequence, quality = quality_filter_sequence(
            header, sequence, quality, max_bad_run_length,
            phred_quality_threshold, min_per_read_length, seq_max_N,
            filter_bad_illumina_qual_digit)

        # Any non-zero result means rejection: record the reason, move on.
        if quality_filter_result != 0:
            if quality_filter_result not in rejected_by_filter:
                raise ValueError("Unknown quality filter result: %d" %
                                 quality_filter_result)
            rejected_by_filter[quality_filter_result] += 1
            continue

        sequence_lengths.append(len(sequence))
        seqs_per_sample_counts[sample_id] = \
            seqs_per_sample_counts.get(sample_id, 0) + 1

        if rev_comp:
            sequence = str(DNA(sequence).rc())
            quality = quality[::-1]

        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' % (
            sample_id, seq_id, header, barcode,
            corrected_barcode, num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1

    # Samples that received no reads still appear in the log with a count
    # of zero.
    for known_sample_id in barcode_to_sample_id.values():
        seqs_per_sample_counts.setdefault(known_sample_id, 0)

    if log_f is not None:
        log_f.write(format_split_libraries_fastq_log(
            count_barcode_not_in_map, rejected_by_filter[1],
            rejected_by_filter[2], rejected_by_filter[3],
            count_barcode_errors_exceed_max, input_sequence_count,
            sequence_lengths, seqs_per_sample_counts))

    if sequence_lengths and histogram_f is not None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_f.write(format_histogram_one_count(counts, bin_edges))
        histogram_f.write('\n--\n\n')
def convert_fastaqual(fasta_file_path,
                      output_directory='.',
                      multiple_output_files=False,
                      ascii_increment=33,
                      full_fastq=False,
                      full_fasta_headers=False,
                      per_file_buffer_size=100000):
    '''Takes a FASTQ file, generates FASTA and QUAL file(s)

    fasta_file_path:  filepath of input FASTQ file.
    output_directory:  Directory to output converted files.
    multiple_output_files:  Make one file per SampleID. The SampleID is the
     part of the first whitespace-delimited label token before the first
     underscore.
    ascii_increment:  Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq:  Write labels to both sequence and quality score lines.
     Accepted for interface compatibility; not used by this function.
    full_fasta_headers:  Retain all data on fasta label, instead of breaking at
     first whitespace.
    per_file_buffer_size:  With multiple_output_files, number of buffered
     FASTA characters per output file that triggers appending the buffered
     FASTA and QUAL records to disk.

    Raises ValueError if any decoded quality score is negative.'''

    # rename this to avoid confusion...
    fastq_fp = fasta_file_path

    # if we are NOT using multiple output files, then open our two (and only)
    # output files here
    if not multiple_output_files:
        fasta_out_fp = get_filename_with_new_ext(fastq_fp, '.fna',
                                                 output_directory)
        qual_out_fp = get_filename_with_new_ext(fastq_fp, '.qual',
                                                output_directory)

        fasta_out_f = open(fasta_out_fp, 'w')
        qual_out_f = open(qual_out_fp, 'w')

    else:
        fasta_out_f = None
        qual_out_f = None
        # per-output-path text buffers, appended to disk in batches
        fasta_out_lookup = defaultdict(str)
        qual_out_lookup = defaultdict(str)

    fpo = ascii_increment
    try:
        # the input handle is closed as soon as parsing finishes or fails
        with open(fastq_fp, 'U') as fastq_f:
            for header, sequence, qual in parse_fastq(fastq_f,
                                                      strict=False,
                                                      phred_offset=fpo):
                label = header.split()[0]
                sample_id = label.split('_')[0]

                if multiple_output_files:
                    fasta_out_fp = get_filename_with_new_ext(
                        fastq_fp, '_' + sample_id + '.fna', output_directory)

                    qual_out_fp = get_filename_with_new_ext(
                        fastq_fp, '_' + sample_id + '.qual', output_directory)

                if full_fasta_headers:
                    label = header

                if (qual < 0).any():
                    raise ValueError(
                        "Output qual scores are negative values. "
                        "Use different ascii_increment value than %s" %
                        str(ascii_increment))

                # write QUAL file, 60 qual scores per line
                qual_record = [">%s\n" % label]
                for i in range(0, len(qual), 60):
                    qual_record.append(
                        ' '.join([str(q) for q in qual[i:i + 60]]))
                    qual_record.append('\n')
                qual_record = ''.join(qual_record)

                if multiple_output_files:
                    qual_out_lookup[qual_out_fp] += qual_record
                else:
                    qual_out_f.write(qual_record)

                # write FASTA file
                fasta_record = '>%s\n%s\n' % (label, sequence)
                if multiple_output_files:
                    fasta_out_lookup[fasta_out_fp] += fasta_record
                else:
                    fasta_out_f.write(fasta_record)

                # with multiple output files, records are buffered and each
                # file is opened only while its buffer is appended, to avoid
                # potentially using up all the OS's filehandles. The buffer
                # *length* is compared with the threshold (the buffer string
                # itself used to be compared with the integer).
                if multiple_output_files:
                    buffered = len(fasta_out_lookup[fasta_out_fp])
                    if buffered >= per_file_buffer_size:
                        with open(fasta_out_fp, 'a') as fasta_f:
                            fasta_f.write(fasta_out_lookup[fasta_out_fp])
                        fasta_out_lookup[fasta_out_fp] = ''

                        with open(qual_out_fp, 'a') as qual_f:
                            qual_f.write(qual_out_lookup[qual_out_fp])
                        qual_out_lookup[qual_out_fp] = ''
    finally:
        # the single pair of output files is closed even when parsing fails
        if not multiple_output_files:
            fasta_out_f.close()
            qual_out_f.close()

    # append whatever is still buffered for each per-sample output file
    if multiple_output_files:
        for fasta_out_fp, records in fasta_out_lookup.items():
            if records:
                with open(fasta_out_fp, 'a') as fasta_f:
                    fasta_f.write(records)

        for qual_out_fp, records in qual_out_lookup.items():
            if records:
                with open(qual_out_fp, 'a') as qual_f:
                    qual_f.write(records)
示例#31
0
def filter_fastq_fp(input_seqs_fp, output_seqs_fp, seqs_to_keep, negate=False):
    """Run filter_fastq on a fastq file given input and output file paths.

    The input path is opened and parsed non-strictly with parse_fastq, the
    output path is opened for writing, and both are passed to filter_fastq
    together with seqs_to_keep and negate. Its return value is returned.
    """
    # NOTE(review): the handles opened here are not closed by this function;
    # presumably filter_fastq closes the output handle -- confirm.
    source_handle = open(input_seqs_fp, "U")
    parsed_records = parse_fastq(source_handle, strict=False)
    destination_handle = open(output_seqs_fp, "w")
    return filter_fastq(parsed_records, destination_handle, seqs_to_keep,
                        negate)
def convert_fastaqual(fasta_file_path, output_directory='.',
                      multiple_output_files=False, ascii_increment=33,
                      full_fastq=False, full_fasta_headers=False,
                      per_file_buffer_size=100000):
    '''Takes a FASTQ file, generates FASTA and QUAL file(s)

    fasta_file_path:  filepath of input FASTQ file.
    output_directory:  Directory to output converted files.
    multiple_output_files:  Make one file per SampleID (the text before the
     first underscore of the first whitespace-delimited label token).
    ascii_increment:  Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq:  Write labels to both sequence and quality score lines.
     Currently not referenced by this function.
    full_fasta_headers:  Retain all data on fasta label, instead of breaking at
     first whitespace.
    per_file_buffer_size:  Only used with multiple_output_files. Once the
     buffered FASTA text for an output file reaches this many characters,
     the buffered FASTA and QUAL text for it is appended to disk.

    Raises ValueError when a decoded quality score is negative.'''

    # rename this to avoid confusion...
    fastq_fp = fasta_file_path

    # if we are NOT using multiple output files, then open our two (and only)
    # output files here
    single_outputs = []
    if not multiple_output_files:
        fasta_out_fp = get_filename_with_new_ext(fastq_fp,
                                                 '.fna',
                                                 output_directory)
        qual_out_fp = get_filename_with_new_ext(fastq_fp,
                                                '.qual',
                                                output_directory)

        fasta_out_f = open(fasta_out_fp, 'w')
        single_outputs.append(fasta_out_f)
        qual_out_f = open(qual_out_fp, 'w')
        single_outputs.append(qual_out_f)

    else:
        # text buffers keyed by output file path
        fasta_out_lookup = defaultdict(str)
        qual_out_lookup = defaultdict(str)

    fpo = ascii_increment
    try:
        fastq_f = open(fastq_fp, 'U')
        try:
            for header, sequence, qual in parse_fastq(fastq_f,
                                                      strict=False,
                                                      phred_offset=fpo):
                label = header.split()[0]
                sample_id = label.split('_')[0]

                if multiple_output_files:
                    fasta_out_fp = get_filename_with_new_ext(
                        fastq_fp, '_' + sample_id + '.fna', output_directory)

                    qual_out_fp = get_filename_with_new_ext(
                        fastq_fp, '_' + sample_id + '.qual', output_directory)

                if full_fasta_headers:
                    label = header

                if (qual < 0).any():
                    raise ValueError(
                        "Output qual scores are negative values. "
                        "Use different ascii_increment value than %s" %
                        str(ascii_increment))

                # write QUAL file, 60 qual scores per line
                qual_record = [">%s\n" % label]
                for i in range(0, len(qual), 60):
                    qual_record.append(
                        ' '.join([str(q) for q in qual[i:i + 60]]))
                    qual_record.append('\n')
                qual_record = ''.join(qual_record)

                if multiple_output_files:
                    qual_out_lookup[qual_out_fp] += qual_record
                else:
                    qual_out_f.write(qual_record)

                # write FASTA file
                fasta_record = '>%s\n%s\n' % (label, sequence)
                if multiple_output_files:
                    fasta_out_lookup[fasta_out_fp] += fasta_record
                else:
                    fasta_out_f.write(fasta_record)

                # if we're writing multiple output files, each file is only
                # held open while its buffer is appended, to avoid
                # potentially using up all the OS's filehandles. The flush is
                # triggered by the buffer's length; comparing the buffer
                # string itself against the integer threshold was a bug.
                if multiple_output_files and \
                        len(fasta_out_lookup[fasta_out_fp]) >= \
                        per_file_buffer_size:
                    fasta_f = open(fasta_out_fp, 'a')
                    try:
                        fasta_f.write(fasta_out_lookup[fasta_out_fp])
                    finally:
                        fasta_f.close()
                    fasta_out_lookup[fasta_out_fp] = ''

                    qual_f = open(qual_out_fp, 'a')
                    try:
                        qual_f.write(qual_out_lookup[qual_out_fp])
                    finally:
                        qual_f.close()
                    qual_out_lookup[qual_out_fp] = ''
        finally:
            # release the input handle whether or not parsing succeeded
            fastq_f.close()
    finally:
        # if we have one pair of output files, close it now (also on error)
        for out_f in single_outputs:
            out_f.close()

    # with multiple output files, write out what is still buffered
    if multiple_output_files:
        for fasta_out_fp, records in fasta_out_lookup.items():
            if records:
                fasta_f = open(fasta_out_fp, 'a')
                try:
                    fasta_f.write(records)
                finally:
                    fasta_f.close()

        for qual_out_fp, records in qual_out_lookup.items():
            if records:
                qual_f = open(qual_out_fp, 'a')
                try:
                    qual_f.write(records)
                finally:
                    qual_f.close()
示例#33
0
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_to_ascii_f=None):
    """Demultiplex and quality filter a single-end fastq read file.

    This is a generator: the barcode and read fastq inputs are parsed in
    lockstep, each read is assigned to a sample via its barcode, quality
    filtered, and (if it survives) yielded.

    Parameters
    ----------
    fastq_read_f : file-like or sequence of lines
        The reads. Either an object supporting ``readline()`` and
        ``seek(0)``, or an indexable sequence whose first two items are the
        first header line and the first sequence line.
    fastq_barcode_f : file-like or sequence of lines
        The barcode (index) reads, handed to ``parse_fastq``.
    barcode_to_sample_id : dict
        Maps barcode sequence to sample id. If all barcodes have the same
        length, barcode reads are truncated to that length before lookup.
    store_unassigned : bool
        If True, reads for which no sample id is found are kept under the
        sample id ``'Unassigned'``; otherwise they are skipped and counted.
    max_bad_run_length, phred_quality_threshold, seq_max_N,
    filter_bad_illumina_qual_digit
        Forwarded unchanged to ``quality_filter_sequence``.
    min_per_read_length_fraction : float
        Multiplied by the length of the first read's sequence to obtain the
        minimum read length forwarded to ``quality_filter_sequence``.
    rev_comp : bool
        If True, yield the reverse complement of the sequence (and the
        reversed quality).
    rev_comp_barcode : bool
        If True, reverse complement the barcode before looking it up.
    start_seq_id : int
        Id given to the first yielded record; incremented per yielded record.
    log_f, histogram_f : file-like or None
        If given, a summary log / a sequence length histogram is written
        after all input has been processed.
    barcode_correction_fn : callable or None
        Forwarded to ``correct_barcode``.
    max_barcode_errors : number
        Reads whose barcode has more errors than this are skipped.
    strict_header_match : bool
        If True, mismatching barcode/read headers raise ``FastqParseError``.
    phred_to_ascii_f
        Not used by this function.
        NOTE(review): presumably retained for interface compatibility.

    Yields
    ------
    tuple
        ``(fasta_header, sequence, quality, seq_id)`` for every read that
        passes barcode assignment and quality filtering.

    Raises
    ------
    FastqParseError
        If ``strict_header_match`` is set and a barcode header does not
        match the corresponding read header.
    ValueError
        If ``quality_filter_sequence`` returns a result code other than
        0, 1, 2 or 3.

    Notes
    -----
    Because this is a generator, ``log_f`` and ``histogram_f`` are only
    written to once the caller has consumed every yielded record.
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2

    seq_id = start_seq_id

    # Peek at the first record: grab the first two lines and rewind. Inputs
    # without readline()/seek() are treated as indexable sequences of lines.
    try:
        fastq_read_f_line1 = fastq_read_f.readline()
        fastq_read_f_line2 = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        fastq_read_f_line1 = fastq_read_f[0]
        fastq_read_f_line2 = fastq_read_f[1]

    # The header format of the first read selects both the phred offset and
    # the function used to compare barcode and read headers.
    if is_casava_v180_or_later(fastq_read_f_line1):
        offset = 33
        check_header_match_f = check_header_match_180_or_later
    else:
        offset = 64
        check_header_match_f = check_header_match_pre180

    # Compute the barcode length, if they are all the same. This is useful
    # for selecting a subset of the barcode read if it's too long (e.g., for
    # technical reasons on the sequencer).
    barcode_lengths = set(len(bc) for bc in barcode_to_sample_id)
    if len(barcode_lengths) == 1:
        barcode_length = barcode_lengths.pop()
    else:
        barcode_length = None

    # Compute the minimum read length as a fraction of the length of the
    # first input read. readline() keeps the line terminator, so strip
    # trailing whitespace first: otherwise the threshold is inflated for
    # file input and differs from what a list of stripped lines would give.
    first_read_length = len(fastq_read_f_line2.rstrip())
    min_per_read_length = min_per_read_length_fraction * first_read_length

    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}

    for bc_data, read_data in izip(
            parse_fastq(fastq_barcode_f, strict=False, phred_offset=offset),
            parse_fastq(fastq_read_f, strict=False, phred_offset=offset)):
        input_sequence_count += 1

        # Confirm match between barcode and read headers
        if strict_header_match and \
           (not check_header_match_f(bc_data[header_index],
                                     read_data[header_index])):
            raise FastqParseError("Headers of barcode and read do not match. Can't continue. "
                                  "Confirm that the barcode fastq and read fastq that you are "
                                  "passing match one another.")
        header = read_data[header_index]

        # Grab the barcode sequence. When all expected barcodes share one
        # length, only that many leading bases are used: extra cycles are
        # sometimes run for technical reasons and the surplus bases are not
        # barcode sequence. The reverse complement (if requested) is taken
        # after this truncation.
        barcode = bc_data[sequence_index]
        if barcode_length:
            barcode = barcode[:barcode_length]
        if rev_comp_barcode:
            barcode = str(DNA(barcode).rc())

        # Grab the read sequence and quality
        sequence = read_data[sequence_index]
        quality = read_data[quality_index]

        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
            correct_barcode(
                barcode,
                barcode_to_sample_id,
                barcode_correction_fn)

        # skip reads with too many barcode errors
        if num_barcode_errors > max_barcode_errors:
            count_barcode_errors_exceed_max += 1
            continue

        # skip unassignable reads unless otherwise requested
        if sample_id is None:
            if not store_unassigned:
                count_barcode_not_in_map += 1
                continue
            sample_id = 'Unassigned'

        quality_filter_result, sequence, quality = \
            quality_filter_sequence(header,
                                    sequence,
                                    quality,
                                    max_bad_run_length,
                                    phred_quality_threshold,
                                    min_per_read_length,
                                    seq_max_N,
                                    filter_bad_illumina_qual_digit)

        # A non-zero result means the read was rejected: record why and move
        # on to the next record.
        if quality_filter_result != 0:
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError(
                    "Unknown quality filter result: %d" %
                    quality_filter_result)
            continue

        sequence_lengths.append(len(sequence))
        seqs_per_sample_counts[sample_id] = \
            seqs_per_sample_counts.get(sample_id, 0) + 1

        if rev_comp:
            sequence = str(DNA(sequence).rc())
            quality = quality[::-1]

        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\
            (sample_id, seq_id, header, barcode,
             corrected_barcode, num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1

    # Add sample IDs with zero counts to dictionary for logging
    for curr_sample_id in barcode_to_sample_id.values():
        seqs_per_sample_counts.setdefault(curr_sample_id, 0)

    if log_f is not None:
        log_str = format_split_libraries_fastq_log(count_barcode_not_in_map,
                                                   count_too_short,
                                                   count_too_many_N,
                                                   count_bad_illumina_qual_digit,
                                                   count_barcode_errors_exceed_max,
                                                   input_sequence_count,
                                                   sequence_lengths,
                                                   seqs_per_sample_counts)
        log_f.write(log_str)

    # Only emit a histogram when at least one read survived filtering.
    if sequence_lengths and histogram_f is not None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_str = format_histogram_one_count(counts, bin_edges)
        histogram_f.write(histogram_str)
        histogram_f.write('\n--\n\n')