示例#1
0
    def test_sequence_reader(self):
        """Check format autodetection for paths, real file objects and in-memory streams."""
        fastq_path = "tests/data/simple.fastq"
        fasta_path = "tests/data/simple.fasta"

        # Autodetection when a path is given
        for path, expected in ((fastq_path, simple_fastq), (fasta_path, simple_fasta)):
            with dnaio.open(path) as reader:
                records = list(reader)
            assert records == expected

        # Autodetection when an already opened binary file is given
        with open(fastq_path, 'rb') as handle:
            records = list(dnaio.open(handle))
        assert records == simple_fastq

        # Autodetection when the stream is a BytesIO, so no name attribute is available
        for path, expected in ((fastq_path, simple_fastq), (fasta_path, simple_fasta)):
            with open(path, 'rb') as handle:
                stream = BytesIO(handle.read())
            records = list(dnaio.open(stream))
            assert records == expected
示例#2
0
 def __call__(self, read1, read2, matches1, matches2):
     """
     Route a read pair to output files chosen by the latest adapter match on R1 and R2.

     The adapter names of the last entries of matches1 and matches2 form the
     key under which a pair of writers is cached in self.writers. Writers are
     opened the first time a key is seen; a side without a match falls back to
     self.untrimmed_name, and if that is None as well the pair is not written.
     DISCARD is returned in every case.
     """
     assert read2 is not None
     adapter1 = matches1[-1].adapter.name if matches1 else None
     adapter2 = matches2[-1].adapter.name if matches2 else None
     key = (adapter1, adapter2)
     if key not in self.writers:
         label1 = self.untrimmed_name if adapter1 is None else adapter1
         label2 = self.untrimmed_name if adapter2 is None else adapter2
         if label1 is None or label2 is None:
             # No output configured for this combination
             return DISCARD
         first_path = self._make_path(self.template, label1, label2)
         second_path = self._make_path(self.paired_template, label1, label2)
         first_writer = dnaio.open(first_path, mode='w', qualities=self.qualities)
         second_writer = dnaio.open(second_path, mode='w', qualities=self.qualities)
         self.writers[key] = (first_writer, second_writer)
     out1, out2 = self.writers[key]
     self.written += 1
     self.written_bp[0] += len(read1)
     self.written_bp[1] += len(read2)
     out1.write(read1)
     out2.write(read2)
     return DISCARD
示例#3
0
 def test_autodetect_fastq_format(self):
     """Opening a ``.fastq`` path for writing yields a FastqWriter; the records round-trip."""
     fastq_path = os.path.join(self._tmpdir, 'tmp.fastq')
     with dnaio.open(fastq_path, mode='w') as writer:
         assert isinstance(writer, FastqWriter)
         for record in simple_fastq:
             writer.write(record)
     with dnaio.open(fastq_path) as reader:
         records = list(reader)
         assert records == simple_fastq
示例#4
0
 def test_write_qualities_to_fasta(self):
     """With a ``.fasta`` path, qualities=True still yields a FastaWriter and FASTA records."""
     fasta_path = os.path.join(self._tmpdir, 'tmp.fasta')
     with dnaio.open(fasta_path, mode='w', qualities=True) as writer:
         assert isinstance(writer, FastaWriter)
         for record in simple_fastq:
             writer.write(record)
     with dnaio.open(fasta_path) as reader:
         records = list(reader)
         assert records == simple_fasta
def main():
    """Invoke when run directly as a program.

    Validate the FASTQ files named on the command line:

    - every file must exist and have a supported file name extension,
    - no file name may be given twice,
    - ``-fq`` and ``-fq2`` must list the same number of files when both are used,
    - every file (or mate pair) must parse as FASTQ and contain at least one read.

    Problems are reported through ``set_error`` (defined elsewhere; presumably
    it terminates the program -- TODO confirm).
    """
    args = parse_arguments()

    input_fastq = (args.fq + args.fq2) if args.fq2 else args.fq

    for fq_file in input_fastq:
        if not isfile(fq_file):
            set_error('Input file {} does not exist'.format(basename(fq_file)))
        if not fq_file.lower().endswith(SUPPORTED_EXTENSIONS):
            set_error('Unrecognized file name extension in file {}. '
                      'Supported file name extensions are {}.'.format(fq_file, SUPPORTED_EXTENSIONS))

    # Reduce the probability of uploading the FASTQ files with the same
    # content multiple times (as multiple lanes or mates).
    if len(set(input_fastq)) != len(input_fastq):
        seen_files = [item for item, count in collections.Counter(input_fastq).items() if count > 1]
        set_error('Non-unique input file names detected: {}.'.format(seen_files))

    if args.fq2 and len(args.fq) != len(args.fq2):
        set_error('The number of mate-pair files in split-lane samples must match. '
                  '{} and {} input files were given for the -fq and -fq2 inputs, '
                  'respectively.'.format(len(args.fq), len(args.fq2)))

    if args.fq2:
        for mate1, mate2 in zip(args.fq, args.fq2):
            try:
                # Parse the whole file in ONE pass while counting records.
                # Probing with any() and then iterating again would start a
                # second iteration on a reader that was already partly
                # consumed, and the reader would never be closed.
                with dnaio.open(mate1, file2=mate2, fileformat='fastq') as paired_reads:
                    n_pairs = sum(1 for _ in paired_reads)
            except (FastqFormatError, FileFormatError) as dnaio_error:
                set_error(
                    'Format error in mate-pairs {} and {}. {}'.format(
                        basename(mate1), basename(mate2), str(dnaio_error))
                )
            else:
                if n_pairs == 0:
                    set_error(
                        'Mate-pair files {} and {} contain no read sequences.'.format(basename(mate1), basename(mate2))
                    )
                else:
                    print('Successfully validated mate-pair files {} and {}.'.format(basename(mate1), basename(mate2)))
    else:
        for fq in args.fq:
            try:
                # Single counting pass; see the paired branch for the rationale.
                with dnaio.open(fq, fileformat='fastq') as reads:
                    n_reads = sum(1 for _ in reads)
            except (FastqFormatError, FileFormatError) as dnaio_error:
                set_error('Error in file {}. {}'.format(basename(fq), str(dnaio_error)))
            else:
                if n_reads == 0:
                    set_error('Input file {} contains no read sequences.'.format(basename(fq)))
                else:
                    print('Successfully validated reads file {}.'.format(basename(fq)))
示例#6
0
 def test_autodetect_fasta_format(self, tmpdir):
     """Opening a ``.fasta`` path for writing yields a FastaWriter; reading gives FASTA records."""
     fasta_path = str(tmpdir.join('tmp.fasta'))
     with dnaio.open(fasta_path, mode='w') as writer:
         assert isinstance(writer, FastaWriter)
         for record in simple_fastq:
             writer.write(record)
     with dnaio.open(fasta_path) as reader:
         read_back = list(reader)
     assert read_back == simple_fasta
示例#7
0
def main(args):
    """
    Cluster UMIs within each DBS cluster and write reads carrying the corrected UMI.

    Reads args.abcfile (UMI reads, filtered by args.length) and args.dbsfile,
    clusters the UMIs of every DBS with UMIClusterer (directional method,
    args.threshold) and writes one FASTA record per read to args.output, using
    the first sequence of the cluster as the record sequence. Counters are
    accumulated in the module-level ``stats`` mapping and logged at the end.
    """
    logger.info(f"Filtering reads not of length {args.length} bp.")

    t_start = time.time()

    # Read ABC fasta with UMI sequences and save read name and sequence.
    with dnaio.open(args.abcfile, mode="r") as abc_reader:
        umis = get_umis(abc_reader, length=args.length)

    t_filtered = time.time()
    logger.info(f"Time for filtering: {t_filtered - t_start} s")
    logger.info("Assigning UMIs to DBS clusters")

    with dnaio.open(args.dbsfile, mode="r") as dbs_reader:
        dbs_umis = assign_to_dbs(dbs_reader, umis)

    logger.info(f"DBS clusters linked to ABC: {len(dbs_umis)}")

    t_assigned = time.time()
    logger.info(f"Time for assigning clusters: {t_assigned - t_filtered} s")
    logger.info("Starting clustering of UMIs within clusters.")

    # Set clustering method
    # Based on https://umi-tools.readthedocs.io/en/latest/API.html
    clusterer = UMIClusterer(cluster_method='directional')

    with dnaio.open(args.output, fileformat="fasta", mode="w") as output:
        for reads_by_umi in dbs_umis.values():
            # UMITools works on bytes keys; the value is the read count per UMI
            counts = {
                bytes(umi, encoding='utf-8'): len(reads)
                for umi, reads in reads_by_umi.items()
            }
            stats["Total UMIs"] += len(counts)

            clustered_umis = clusterer(counts, threshold=args.threshold)
            stats["Total clustered UMIs"] += len(clustered_umis)

            # Every read of a cluster is written with the cluster's first UMI.
            for cluster in clustered_umis:
                members = [member.decode("utf-8") for member in cluster]
                canonical = members[0]
                for member in members:
                    for read_name in reads_by_umi[member]:
                        output.write(dnaio.Sequence(read_name, canonical))

    t_end = time.time()
    logger.info(f"Time for clustering: {t_end - t_assigned} s")
    logger.info(f"Total time to run: {t_end - t_start} s")

    # Send stats to log
    logger.info(f"Reads filtered out: {stats['Reads filtered out']:,}")
    logger.info(f"Reads kept: {stats['Reads kept']}")
    logger.info(f"Total UMIs: {stats['Total UMIs']}")
    logger.info(f"Total clustered UMIs: {stats['Total clustered UMIs']}")
示例#8
0
def split_fastq_reads(fastq_path,
                      output_path,
                      trim_b=0,
                      size_l=40,
                      size_r=40,
                      size_m=30):
    """
    Split every read of a FASTQ file into parts for remapping.

    Depending on its length (after optional trimming) a read is
    1) skipped, when it is not longer than max(size_l, size_r),
    2) split into a left and a right part,
    3) split into a left, a right and a middle part, when it is at least
       size_l + size_r + size_m long.

    The parts are written to output_path with ``-l``, ``-r`` and ``-m``
    appended to the read name.

    Parameters
    ----------
    fastq_path
        Input file, opened with dnaio.
    output_path
        Output file, opened with dnaio in write mode.
    trim_b
        Number of bases removed from BOTH ends of each read before splitting.
    size_l
        Length of the left part.
    size_r
        Length of the right part.
    size_m
        Minimum length of the middle part.

    Returns
    -------
    None
    """
    trim_b = int(trim_b)
    shortest_usable = max(size_l, size_r)
    with_middle_from = size_l + size_r + size_m
    with dnaio.open(fastq_path) as reader, \
            dnaio.open(output_path, mode='w') as writer:
        for read in reader:
            if trim_b > 0:
                read = read[trim_b:-trim_b]
            n_bases = len(read)
            if n_bases <= shortest_usable:
                continue

            # Left and right parts may overlap on short reads
            parts = [('-l', read[:size_l]), ('-r', read[-size_r:])]
            # The middle part is only used when it is long enough
            if n_bases >= with_middle_from:
                parts.append(('-m', read[size_l:-size_r]))

            for suffix, part in parts:
                part.name += suffix
                writer.write(part)
示例#9
0
 def test_autodetect_fastq_weird_name(self):
     """A gzipped FASTQ file is still read correctly after losing its ``.fastq`` extension."""
     original = os.path.join(self._tmpdir, 'tmp.fastq.gz')
     with dnaio.open(original, mode='w') as writer:
         assert isinstance(writer, FastqWriter)
         for record in simple_fastq:
             writer.write(record)
     # Rename so that the file name no longer hints at the format
     renamed = os.path.join(self._tmpdir, 'tmp.weird.gz')
     os.rename(original, renamed)
     with dnaio.open(renamed) as reader:
         records = list(reader)
         assert records == simple_fastq
示例#10
0
def test_append(tmp_path, fileformat, extension):
    """Opening an existing file with mode="a" adds records after those already written."""
    first = dnaio.SequenceRecord("s1", "ACGT", "HHHH")
    second = dnaio.SequenceRecord("s2", "CGCA", "8383")
    path = tmp_path / ("out." + fileformat + extension)
    for mode, record in (("w", first), ("a", second)):
        with dnaio.open(path, mode=mode) as writer:
            writer.write(record)
    with xopen(path) as handle:
        expected = formatted_sequences([first, second], fileformat)
        assert expected == handle.read()
示例#11
0
def main(args):
    """
    Move the first 20 bases of read 1 into the names of both reads of each pair.

    Input is taken from args.input1 (interleaved) or args.input1/args.input2
    (paired). Output goes to args.output1/args.output2, or interleaved to
    stdout when args.output1 is not given. For every pair the first 20 bases
    and qualities are removed from read 1, and that sequence is appended with
    ``_`` to the first whitespace-separated field of both read names.

    Both files are managed by a ``with`` statement so that they are closed
    (and the output flushed) even if processing a read raises.
    """
    logger.info('Starting')

    progress = BLR.ProgressReporter('Read pairs processed', 1000000)

    input_interleaved = not args.input2
    logger.info(
        f"Input detected as {'interleaved fastq.' if input_interleaved else 'paired fastq.'}"
    )

    # If no output1 is given output is sent to stdout
    if not args.output1:
        logger.info("Writing output to stdout.")
        args.output1 = sys.stdout.buffer
        args.output2 = None

    output_interleaved = not args.output2
    logger.info(
        f"Output detected as {'interleaved fastq.' if output_interleaved else 'paired fastq.'}"
    )

    with dnaio.open(args.input1,
                    file2=args.input2,
                    interleaved=input_interleaved,
                    mode="r",
                    fileformat="fastq") as reader, \
            dnaio.open(args.output1,
                       file2=args.output2,
                       interleaved=output_interleaved,
                       mode="w",
                       fileformat="fastq") as writer:
        for read1, read2 in reader:
            # Adjusting for BC: cut the first 20 bases off read 1
            bc_seq = read1.sequence[:20]
            read1.sequence = read1.sequence[20:]
            read1.qualities = read1.qualities[20:]

            # Header parsing: first field versus the rest of the header.
            # A header without whitespace raises ValueError here.
            name_and_pos_r1, read_and_index_r1 = read1.name.split(maxsplit=1)
            name_and_pos_r2, read_and_index_r2 = read2.name.split(maxsplit=1)

            # Save header to read instances
            read1.name = name_and_pos_r1 + '_' + bc_seq + ' ' + read_and_index_r1
            read2.name = name_and_pos_r2 + '_' + bc_seq + ' ' + read_and_index_r2

            # Write to out
            writer.write(read1, read2)

            # Progress reporting
            progress.update()

    logger.info('Finished')
示例#12
0
def main(args):
    """
    Rename and/or reorder the records of args.target and print them as FASTA.

    - With args.rename_from, a record whose upper-cased sequence is found in
      that template takes the template's name; otherwise args.not_found is
      appended to its current name. The text after the first space of the old
      name is re-attached as comment.
    - With args.order_by, records are sorted by the gene order of that file;
      otherwise, with args.sort, they are sorted naturally by name.
    """
    template = None
    if args.rename_from:
        with dnaio.open(args.rename_from) as fr:
            template = PrefixDict([])
            for record in fr:
                key = record.sequence.upper()
                try:
                    template.add(key, record.name)
                except ValueError:
                    logger.error('Sequences in entry %r and %r are duplicate',
                        record.name, template[key])
        logger.info('Read %d entries from template', len(template))

    gene_order = None
    if args.order_by:
        with dnaio.open(args.order_by) as fr:
            gene_order = [gene_name(r) for r in fr]

    with dnaio.open(args.target) as fr:
        sequences = list(fr)

    # Rename
    renamed = 0
    if template is not None:
        for record in sequences:
            new_name = template.get(record.sequence.upper())
            if new_name is not None:
                renamed += 1
            else:
                new_name = record.name + args.not_found
            # Replace the record's name, leaving the comment intact
            comment = record.name.partition(' ')[2]
            record.name = (new_name + ' ' + comment) if comment else new_name

    # Reorder
    if gene_order:
        try:
            sequences = sorted_by_gene(sequences, gene_order)
        except GeneMissing as e:
            logger.error('Gene "%s" not found in the --order-by template file', e)
            sys.exit(1)
    elif args.sort:
        sequences = sorted(sequences, key=lambda s: natural_sort_key(s.name))

    for record in sequences:
        print(f'>{record.name}\n{record.sequence}')
    logger.info('Wrote %s FASTA records (%d sequences found in template)', len(sequences), renamed)
示例#13
0
def _open_raise_limit(path, qualities):
    """
    Open a FASTA/FASTQ file for writing and return the writer.

    When opening fails with EMFILE (too many open files), the soft limit on
    open files is raised via raise_open_files_limit(8) and the open is tried
    once more. Any other OSError is re-raised unchanged.
    """
    try:
        return dnaio.open(path, mode="w", qualities=qualities)
    except OSError as exc:
        if exc.errno != errno.EMFILE:
            raise
        # Too many open files: raise the limit, then retry once
        raise_open_files_limit(8)
        return dnaio.open(path, mode="w", qualities=qualities)
示例#14
0
def generate_modified_fastq(read1_file,
                            read2_file,
                            cb_file,
                            read1_coords,
                            modified_read_file,
                            num_mismatches=1,
                            num_n_threshold=3):
    """Match cell barcodes and write a modified FASTQ file of read 2.

    Cell barcodes are read from cb_file (one per line, the part before the
    first ``-`` is used) and indexed with create_index. Every read pair is
    passed to match_cell_barcodes; for each pair with a result, read 2 is
    written to modified_read_file under a name made of the first field of the
    read name plus an ``RI:Z:`` tag holding ``read1_seq#barcode#distance``.

    Returns:
        A tuple ``(modified_read_file, [n_matched, n_total])``.
    """
    cell_barcodes = [
        line.rstrip().split('-')[0]
        for line in open_by_suffix(cb_file, mode='r')
    ]
    cb_index = create_index(barcodes=cell_barcodes,
                            num_mismatches=num_mismatches)

    # [pairs with a barcode match, pairs seen]
    read_counter = [0, 0]
    with dnaio.open(file1=read1_file,
                    file2=read2_file,
                    fileformat='fastq',
                    mode='r') as pairs, \
            dnaio.open(file1=modified_read_file,
                       fileformat='fastq',
                       mode='w') as f_out:
        for read1, read2 in pairs:
            read_counter[1] += 1

            matched = match_cell_barcodes(
                reads=(read1.name, read1.sequence, read1.qualities,
                       read2.sequence, read2.qualities),
                barcode_index=cb_index,
                read_coords=read1_coords,
                num_mismatches=num_mismatches,
                num_n_threshold=num_n_threshold)
            if not matched:
                continue
            read_counter[0] += 1

            read_name, read1_seq, _, read2_seq, read2_qual, bc, dist = matched
            read_info = '#'.join([read1_seq, bc, str(dist)])
            tagged_name = ' '.join(
                [read_name.split(' ')[0], 'RI:Z:' + read_info])

            f_out.write(dnaio.Sequence(tagged_name, read2_seq, read2_qual))

    return modified_read_file, read_counter
示例#15
0
def length_histogram(path):
    """Return a list with the sequence length of every record in the file at *path*."""
    with dnaio.open(path) as reader:
        return [len(record.sequence) for record in reader]
示例#16
0
    def get_sequence(read1_file, read2_file,
                     read1_coords=read1_coords, read2_coords=read2_coords):
        """Yield ``(r1, r2, read1_seq, read2_seq)`` for every pair of two FASTQ files.

        ``r1``/``r2`` are the sub-sequences selected by the ``(start, end)``
        coordinates (end is capped at the read length); when no coordinates
        are given for a read, its full sequence is used instead.
        """
        with dnaio.open(file1=read1_file,
                        file2=read2_file,
                        fileformat='fastq',
                        mode='r') as pairs:
            for read1, read2 in pairs:
                read1_seq = read1.sequence
                read2_seq = read2.sequence

                r1 = read1_seq
                if read1_coords:
                    start1, end1 = read1_coords
                    r1 = read1_seq[start1: min(end1, len(read1_seq))]

                r2 = read2_seq
                if read2_coords:
                    start2, end2 = read2_coords
                    r2 = read2_seq[start2: min(end2, len(read2_seq))]

                yield r1, r2, read1_seq, read2_seq
示例#17
0
def main():
    """
    Cluster barcodes from a FASTA file and print the distribution of cluster sizes.

    Record names are expected in the form ``id:count:sequence``. The counts
    per sequence are clustered with UMIClusterer (directional, threshold 1);
    only the clustering call itself is timed.
    """
    args = get_arguments()

    bc_dict = {}
    with dnaio.open(args.fasta, mode="r") as reader:
        for record in reader:
            _, bc_count, bc_seq = record.name.strip('>').split(':')
            bc_dict[bc_seq] = int(bc_count)

    # Based on https://umi-tools.readthedocs.io/en/latest/API.html
    clusterer = UMIClusterer(cluster_method='directional')

    started = time.time()
    clustered_bcs = clusterer(bc_dict, threshold=1)
    finished = time.time()

    size_frequencies = sorted(Counter(len(cluster) for cluster in clustered_bcs).items())
    print("Cluster size, Frequency")
    for cluster_size, frequency in size_frequencies:
        print(f"{cluster_size:12}, {frequency:9}")

    print(f'Time to run: {finished-started} s')
    print(f'Length data: {len(bc_dict)}')
示例#18
0
 def _open_writer(self, file, file2=None, force_fasta=None, **kwargs):
     """
     Open a dnaio writer for *file* (and optionally *file2*).

     Qualities are written according to self.uses_qualities. A true
     *force_fasta* sets the file format to FASTA; other keyword arguments are
     passed on to dnaio.open.
     """
     # TODO backwards-incompatible change (?) would be to use outfiles.interleaved
     # for all outputs
     if force_fasta:
         kwargs.update(fileformat='fasta')
     return dnaio.open(
         file,
         file2=file2,
         mode='w',
         qualities=self.uses_qualities,
         **kwargs
     )
示例#19
0
    def build_index(self, size=None):
        """Yield dictionaries indexing the read pairs of self.r1 / self.r2.

        Each dictionary maps the ``identifier`` of a ReadPair to a 2-tuple of
        the sequences of read 1 and read 2.

        If size is None (or 0), a single dictionary with all pairs is yielded.
        Otherwise a dictionary is yielded each time it holds ``size`` entries,
        followed by a final one with the remainder (possibly empty).

        Note: the SAME dictionary object is yielded every time and is cleared
        as soon as the consumer asks for the next chunk, so a chunk must be
        processed (or copied) before iteration continues.
        """
        index = dict()
        n_indexed = 0
        with dnaio.open(self.r1, file2=self.r2) as pairs:
            for read1, read2 in pairs:
                pair = ReadPair(read1, read2)
                ident = pair.identifier
                index[ident] = (pair.read1.sequence, pair.read2.sequence)
                n_indexed += 1
                if not size or len(index) < size:
                    continue
                # The chunk is full: hand it out, then start over
                print("%d reads indexed" % n_indexed)
                n_indexed = 0
                yield index
                # Clear the dictionary so that it will take the next chunk.
                index.clear()
                gc.collect()
        print("%d reads indexed" % n_indexed)
        yield index
示例#20
0
 def _read_fasta(path):
     """Return the records of the file at *path*, with names cut at the first whitespace."""
     with dnaio.open(path) as reader:
         loaded = []
         for entry in reader:
             # Keep only the first whitespace-separated field of the header
             entry.name = entry.name.split(maxsplit=1)[0]
             loaded.append(entry)
     return loaded
示例#21
0
def test_write(tmpdir, extension):
    """A single record written through dnaio.open appears as four FASTQ lines in the file."""
    record = dnaio.Sequence('name', 'ACGT', 'HHHH')
    out_fastq = tmpdir.join("out.fastq" + extension)
    with dnaio.open(str(out_fastq), mode='w') as writer:
        writer.write(record)
    with xopen(out_fastq) as handle:
        content = handle.read()
        assert content == '@name\nACGT\n+\nHHHH\n'
示例#22
0
 def test_non_ascii_in_record(self):
     """A non-ASCII byte in a FASTQ record raises FastqFormatError mentioning "Non-ASCII"."""
     # \xc4 -> Ä
     fastq = BytesIO(b'@r1\n\xc4\n+\nH')
     with pytest.raises(FastqFormatError) as e:
         with dnaio.open(fastq) as f:
             list(f)
     # The message must be checked AFTER the raises-block: inside it, this
     # line came after the raising statement and was therefore never executed.
     e.match("Non-ASCII")
示例#23
0
def validate_fasta(path):
    """
    Ensure that the FASTA file is suitable for use with makeblastdb.

    The whole file is read first, then the records are checked in order.
    A FastaValidationError is raised at the first record that

    - is empty,
    - has a name that was already seen, or
    - has a sequence (compared case-insensitively) that was already seen.
    """
    with dnaio.open(path) as reader:
        records = list(reader)

    seen_names = set()
    name_by_sequence = {}
    for record in records:
        name = record.name
        if len(record.sequence) == 0:
            raise FastaValidationError("Record {!r} is empty".format(name))
        if name in seen_names:
            raise FastaValidationError("Record name {!r} occurs more than once".format(name))
        normalized = record.sequence.upper()
        if normalized in name_by_sequence:
            raise FastaValidationError("Records {!r} and {!r} contain the same sequence".format(
                name, name_by_sequence[normalized]))
        name_by_sequence[normalized] = name
        seen_names.add(name)
示例#24
0
def main(args):
    """
    Plot a dendrogram of the sequences in args.fasta and save it to args.plot.

    When args.mark is given, every sequence that is not found in that file
    (according to PrefixComparer) gets " (new)" appended to its label. With
    fewer than two sequences, a "no sequences" text is drawn instead of a
    dendrogram. The linkage method is taken from args.method.
    """
    with dnaio.open(args.fasta) as fr:
        sequences = list(fr)
    logger.info('Plotting dendrogram of %s sequences', len(sequences))
    if args.mark:
        with dnaio.open(args.mark) as fr:
            mark = PrefixComparer(record.sequence for record in fr)
        labels = []
        n_new = 0
        for record in sequences:
            # Sequences not contained in the "mark" file are flagged as new
            if record.sequence not in mark:
                extra = ' (new)'
                n_new += 1
            else:
                extra = ''
            labels.append(record.name + extra)
        logger.info('%s sequence(s) marked as "new"', n_new)
    else:
        labels = [s.name for s in sequences]

    # Plotting libraries are imported here, only when the function runs
    import seaborn as sns
    import matplotlib.pyplot as plt
    sns.set()
    sns.set_style("white")
    # Font size shrinks with the number of labels and is clamped to 6..16.
    # NOTE(review): 297 and 210 look like A4 height/width in mm, converted to
    # inches (/ 25.4) and points (* 72) -- confirm.
    font_size = 297 / 25.4 * 72 / (len(labels) + 5)
    font_size = min(16, max(6, font_size))
    # Figure height in inches: one line of font_size points per label, plus 5 spare lines
    height = font_size * (len(labels) + 5) / 72
    fig = plt.figure(figsize=(210 / 25.4, height))
    matplotlib.rcParams.update({'font.size': 4})
    ax = fig.gca()
    sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)
    sns.set_style('whitegrid')
    if len(sequences) >= 2:
        # m is passed to squareform, so it is presumably a square pairwise
        # distance matrix; y is then its condensed form -- TODO confirm
        m = distances([s.sequence for s in sequences])
        y = distance.squareform(m)
        mindist = int(y.min())
        logger.info('Smallest distance is %s. Found between:', mindist)
        # Report each closest pair once (i < j skips the mirrored entry)
        for i,j in np.argwhere(m == y.min()):
            if i < j:
                logger.info('%s and %s', labels[i], labels[j])
        l = hierarchy.linkage(y, method=args.method)
        # Colour threshold is 95% of the largest value in column 2 of the linkage matrix
        hierarchy.dendrogram(l, labels=labels, leaf_font_size=font_size, orientation='right', color_threshold=0.95*max(l[:,2]))
    else:
        ax.text(0.5, 0.5, 'no sequences', fontsize='xx-large')
    ax.grid(False)
    fig.set_tight_layout(True)
    fig.savefig(args.plot)
示例#25
0
def main(args):
    """
    Dereplicate the sequences in args.fastx and print them as FASTA to stdout.

    At most args.limit records are read; records shorter than
    args.minimum_length are skipped. Records with identical sequences are
    grouped, and the first record of each group is printed with ``size=``
    (group size) in its header. When args.barcode_length is non-zero, the
    barcode is cut off the 5' end (positive length) or the 3' end (negative
    length) and reported as ``barcode=``. With args.trim_g, leading G
    nucleotides are removed after barcode removal. With args.json, the number
    of written groups is saved to that file.
    """
    barcode_length = args.barcode_length
    too_short = 0
    n = 0
    sequences = defaultdict(
        list)  # maps sequences to a list of Sequence objects containing them
    with dnaio.open(args.fastx) as f:
        for record in islice(f, 0, args.limit):
            n += 1
            if len(record) < args.minimum_length:
                too_short += 1
                continue
            sequences[record.sequence].append(record)

    n_written = 0
    for records in sequences.values():
        # If there are multiple records with the same sequence, pick the first
        record = records[0]

        if barcode_length >= 0:
            barcode = record.sequence[:barcode_length]
            unbarcoded = record[barcode_length:]
        else:
            barcode = record.sequence[barcode_length:]
            unbarcoded = record[:barcode_length]

        if args.trim_g:
            # The RACE protocol leads to a run of non-template Gs in the beginning
            # of the sequence, after the barcode.
            trimmed = unbarcoded.sequence.lstrip('G')
            n_stripped = len(unbarcoded.sequence) - len(trimmed)
            unbarcoded.sequence = trimmed
            if unbarcoded.qualities:
                # Drop the qualities of the removed bases from the front.
                # Slicing with [-len(trimmed):] would keep ALL qualities when
                # the sequence consists only of Gs, because -0 == 0.
                unbarcoded.qualities = unbarcoded.qualities[n_stripped:]

        name = record.name.split(maxsplit=1)[0]
        if name.endswith(';'):
            name = name[:-1]

        if barcode_length:
            print('>{};barcode={};size={};\n{}'.format(name, barcode,
                                                       len(records),
                                                       unbarcoded.sequence))
        else:
            print('>{};size={};\n{}'.format(name, len(records),
                                            unbarcoded.sequence))

        n_written += 1

    logger.info('%s sequences processed', n)
    logger.info('%s sequences long enough', n - too_short)
    logger.info('%s dereplicated sequences written', n_written)

    if args.json:
        stats = {
            'groups_written': n_written,
        }
        with open(args.json, 'w') as f:
            json.dump(stats, f, indent=2)
            print(file=f)
示例#26
0
 def _open_writer(self, file, file2, **kwargs):
     """
     Open a dnaio writer for *file* and *file2*.

     Qualities are written according to self.uses_qualities; any additional
     keyword arguments are handed to dnaio.open unchanged.
     """
     # TODO backwards-incompatible change (?) would be to use outfiles.interleaved
     # for all outputs
     writer = dnaio.open(
         file, file2=file2, mode='w', qualities=self.uses_qualities, **kwargs)
     return writer
示例#27
0
def main(args):
    """
    Run IgBLAST on the reads in args.fasta and write the parsed result as a table to stdout.

    - The cache is enabled by args.cache if given, else by the global configuration.
    - args.threads == 0 selects the number of available CPUs.
    - At most args.limit reads are processed.
    - With args.raw, the raw IgBLAST output is also written to that file.
    - With args.rename, query names are replaced by ``<rename>seq<n>``.
    - With args.stats, the totals are written as JSON to that file.

    A broken pipe on stdout ends the program with exit status 1. An input
    without any records is handled: the per-sequence time and the CDR3
    percentage are then reported as 0 instead of dividing by zero.
    """
    config = GlobalConfig()
    use_cache = config.use_cache
    if args.cache is not None:
        use_cache = args.cache
    if use_cache:
        global _igblastcache
        _igblastcache = IgBlastCache()
        logger.info('IgBLAST cache enabled')
    if args.threads == 0:
        args.threads = available_cpu_count()
    logger.info("Running IgBLAST on database sequences to find CDR/FR region locations")
    database = Database(args.database, args.sequence_type)
    logger.info("Running IgBLAST on input reads")
    detected_cdr3s = 0
    writer = TableWriter(sys.stdout)
    start_time = time.time()
    last_status_update = 0
    with ExitStack() as stack:
        if args.raw:
            raw_output = stack.enter_context(xopen(args.raw, 'w'))
        else:
            raw_output = None
        sequences = stack.enter_context(dnaio.open(args.fasta))
        sequences = islice(sequences, 0, args.limit)

        n = 0  # number of records processed so far
        for record in igblast(database, sequences, sequence_type=args.sequence_type,
                species=args.species, threads=args.threads, penalty=args.penalty,
                raw_output=raw_output, use_cache=use_cache):
            n += 1
            if args.rename is not None:
                record.query_name = "{}seq{}".format(args.rename, n)
            d = record.asdict()
            if d['CDR3_aa']:
                detected_cdr3s += 1
            try:
                writer.write(d)
            except IOError as e:
                # The consumer of stdout went away (e.g. "| head"): stop quietly
                if e.errno == errno.EPIPE:
                    sys.exit(1)
                raise
            # Status line at most once per minute, checked every 1000 records
            if n % 1000 == 0:
                elapsed = time.time() - start_time
                if elapsed >= last_status_update + 60:
                    logger.info(
                        'Processed {:10,d} sequences at {:.3f} ms/sequence'.format(n, elapsed / n * 1E3))
                    last_status_update = elapsed
    elapsed = time.time() - start_time
    # n is 0 for an empty input file; avoid ZeroDivisionError in the summary
    ms_per_sequence = elapsed / n * 1E3 if n else 0.0
    logger.info('Processed {:10,d} sequences at {:.1f} ms/sequence'.format(n, ms_per_sequence))

    logger.info('%d IgBLAST assignments parsed and written', n)
    cdr3_percentage = detected_cdr3s / n * 100 if n else 0.0
    logger.info('CDR3s detected in %.1f%% of all sequences', cdr3_percentage)
    if args.stats:
        stats = {'total': n, 'detected_cdr3s': detected_cdr3s}
        with open(args.stats, 'w') as f:
            json.dump(stats, f)
            print(file=f)
示例#28
0
def main(args):
    """
    Replace read sequences by their error-corrected cluster sequence.

    args.err_corr is read as whitespace-separated lines of
    ``cluster_seq  num_reads  raw_seq1,raw_seq2,...``; lines that do not have
    exactly three fields are reported and skipped. Each raw sequence is mapped
    to the first cluster sequence it is listed under. Reads from
    args.raw_fastq whose sequence has a mapping are written, with the
    corrected sequence, as FASTA to args.corr_fasta; all other reads are
    dropped and only counted.

    Warnings are emitted through the module-level ``logger``, like every
    other message of this function.
    """
    logger.info("Starting analysis")
    logger.info(f"Processing file: {args.err_corr}")

    if os.stat(args.err_corr).st_size == 0:
        logger.warning(f"File {args.err_corr} is empty.")

    err_corr = dict()
    clusters = set()
    with open(args.err_corr, "r") as file:
        for line in tqdm(file):
            try:
                # The middle field (number of reads) is not needed
                cluster_seq, _, raw_seqs_list = line.split()
            except ValueError:
                logger.warning(f"Non-default starcode output line: {line}")
                continue
            clusters.add(cluster_seq)
            for raw_seq in raw_seqs_list.split(","):
                # First assignment wins if a raw sequence is listed repeatedly
                if raw_seq not in err_corr:
                    err_corr[raw_seq] = cluster_seq

    logger.info(f"Clusters: {len(clusters)}")

    logger.info("Error corrected sequenced parsed.")

    logger.info("Correcting sequences and writing to output file.")

    counter = Counter()
    with dnaio.open(args.raw_fastq, mode="r", fileformat="fastq") as reader, \
            dnaio.open(args.corr_fasta, mode="w", fileformat="fasta") as openout:
        for read in tqdm(reader):

            counter['tot_reads'] += 1
            if read.sequence in err_corr:
                read.sequence = err_corr[read.sequence]

                openout.write(read)
                counter['corr_seqs'] += 1
            else:
                counter['no_err_corr_seq'] += 1

    logger.info(f"Reads total: {counter['tot_reads']:,}")
    logger.info(f"Reads corrected: {counter['corr_seqs']:,}")
    logger.info(f"Reads without corrected seq: {counter['no_err_corr_seq']:,}")
    logger.info("Finished")
示例#29
0
文件: qc.py 项目: jlduan/fba
    def _get_sequence(read_file):
        """Yield ``(sequence, qualities)`` for every read of a single-end FASTQ file."""
        reader = dnaio.open(file1=read_file,
                            file2=None,
                            fileformat='fastq',
                            mode='r')
        with reader as records:
            for record in records:
                yield record.sequence, record.qualities
示例#30
0
 def _open_writer(
     self,
     file: BinaryIO,
     file2: Optional[BinaryIO] = None,
     force_fasta: Optional[bool] = None,
 ):
     """
     Open a single-end dnaio writer on an already opened binary file object.

     *file2* must be None and *file* must not be a path. The format is forced
     to FASTA when *force_fasta* is true; otherwise None is passed as the
     file format. Qualities follow self.uses_qualities.
     """
     assert file2 is None
     assert not isinstance(file, (str, bytes, Path))
     fileformat = "fasta" if force_fasta else None
     return dnaio.open(
         file,
         mode="w",
         qualities=self.uses_qualities,
         fileformat=fileformat,
     )
示例#31
0
    def get_identifier(fastq_file):
        """Gets the identifier of the first read in a FASTQ file.

        The identifier is the read name up to the first space, cut again at
        the first "/". None is returned when the file has no reads.

        Args:
            fastq_file: The full path of a FASTQ file.

        """
        with dnaio.open(fastq_file) as reader:
            first_read = next(iter(reader), None)
            if first_read is None:
                return None
            first_field = first_read.name.partition(" ")[0]
            return first_field.partition("/")[0]
示例#32
0
 def __call__(self, read, matches):
     """
     Write the read to the output file that belongs to its most recent match.

     Writers per adapter name are opened on first use from self.template
     ("{name}" is replaced by the adapter name). A read without matches goes
     to the untrimmed writer, which is opened on first use if
     self.untrimmed_path is set; without such a path the read is not written.
     DISCARD is returned in every case.
     """
     if matches:
         name = matches[-1].adapter.name
         if name not in self.writers:
             self.writers[name] = dnaio.open(
                 self.template.replace('{name}', name),
                 mode='w',
                 qualities=self.qualities,
             )
         target = self.writers[name]
     else:
         if self.untrimmed_writer is None and self.untrimmed_path is not None:
             self.untrimmed_writer = dnaio.open(
                 self.untrimmed_path, mode='w', qualities=self.qualities)
         target = self.untrimmed_writer
         if target is None:
             # No destination for untrimmed reads is configured
             return DISCARD
     self.written += 1
     self.written_bp[0] += len(read)
     target.write(read)
     return DISCARD
示例#33
0
 def _open_writer(self, file, file2, **kwargs):
     """
     Return a dnaio writer for *file* and *file2*.

     The writer is opened in write mode with qualities set from
     self.uses_qualities; extra keyword arguments go to dnaio.open as given.
     """
     # TODO backwards-incompatible change (?) would be to use outfiles.interleaved
     # for all outputs
     open_options = dict(file2=file2, mode='w', qualities=self.uses_qualities)
     return dnaio.open(file, **open_options, **kwargs)
示例#34
0
 def set_input(self, infiles: InputFiles):
     """Open the files described by *infiles* for reading and store the reader in self._reader."""
     self._reader = dnaio.open(
         infiles.file1,
         file2=infiles.file2,
         interleaved=infiles.interleaved,
         mode='r',
     )