Example #1
0
def combine_dual_indices(file1, file2):
    """Yield combined records for paired dual-index fastq files.

    Each yielded ``Seq`` joins the two index sequences with a ``'+'`` and
    keeps both quality strings (``qual`` from file1, ``qual2`` from file2).

    Raises ValueError if the two files contain different numbers of records
    or if corresponding records have mismatched ids.
    """
    Seq = namedtuple('Seq', ['id', 'seq', 'qual', 'qual2'])
    for i1, i2 in zip_longest(fastqlite(file1), fastqlite(file2)):
        # zip_longest pads the shorter input with None; the original assert
        # would be stripped under -O and, on unequal-length files, would fail
        # with an opaque AttributeError instead -- validate explicitly.
        if i1 is None or i2 is None:
            raise ValueError(
                'index files contain different numbers of records')
        if i1.id != i2.id:
            raise ValueError(
                'mismatched index ids: {} != {}'.format(i1.id, i2.id))
        yield Seq(id=i1.id,
                  seq=i1.seq + '+' + i2.seq,
                  qual=i1.qual,
                  qual2=i2.qual)
Example #2
0
def main(arguments):
    """Count the records in a fastq file and write one CSV row of counts."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'fastq',
        help='reads to count in fastq format',
        metavar='file.fastq[.bz2|.gz]',
        type=Opener(),
    )
    parser.add_argument(
        'read_counts',
        help='tabulate read counts and store as a CSV',
        metavar='FILE',
        type=argparse.FileType('w'),
    )
    args = parser.parse_args(arguments)

    # Single pass over the input; each parsed record contributes one.
    total = 0
    for _ in fastqlite(args.fastq):
        total += 1

    # Row format: input file name, raw count, filtered count (identical here).
    csv.writer(args.read_counts).writerow([args.fastq.name, total, total])
Example #3
0
def _write_record(out_h, sr, fastq):
    """Write one sequence record to out_h in fastq or fasta format."""
    if fastq:
        out_h.write("@{} {}\n{}\n+\n{}\n".format(
            sr.id, sr.description, sr.seq, sr.qual))
    else:
        out_h.write(">%s %s\n%s\n" %
                    (sr.id, sr.description, sr.seq))


def main():
    """Combine multiple fasta/fastq files into one, dropping records whose
    id (and, with --check-seq, sequence) has already been seen.

    Returns -1 when fewer than two input files are given.
    """
    args_parser = argparse.ArgumentParser(
        description=
        """Given a set of fasta / fastq files (min 2), combine them into one fasta file.
        Check at least to be sure no overlapping IDs. Optionally check for overlapping sequences too.
        """)

    args_parser.add_argument(
        'files',
        help='Files to be combined. Minimum of 2 required',
        nargs='+',
        type=fastalite.Opener(mode='r'))
    args_parser.add_argument('--fastq',
                             '-q',
                             help='input and outputs are fastq (not fasta)',
                             action='store_true')
    args_parser.add_argument(
        '--check-seq',
        '-s',
        help=
        'Also check to be sure sequences are not repeated. Default is to only check for repeated IDs',
        action='store_true')
    args_parser.add_argument(
        '--output',
        '-o',
        help='File into which we should place our combined reads',
        required=True,
        type=fastalite.Opener(mode='w'))

    args = args_parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    if len(args.files) < 2:
        logging.error("Only one file given. Nothing to do.")
        return -1

    out_h = args.output

    # The original duplicated the whole read/write loop across the
    # --check-seq and id-only branches; a single loop with an optional
    # sequence check preserves the same accept/reject decisions.
    seq_ids = set()
    seqs = set()  # only consulted when --check-seq is given

    for file_h in args.files:
        if args.fastq:
            reader = fastalite.fastqlite(file_h)
        else:
            reader = fastalite.fastalite(file_h)
        for sr in reader:
            if sr.id in seq_ids:
                continue
            if args.check_seq:
                if sr.seq in seqs:
                    continue
                seqs.add(sr.seq)
            seq_ids.add(sr.id)
            _write_record(out_h, sr, args.fastq)
Example #4
0
def main(arguments=None):
    """Filter fastq reads by the most common barcode in one or two index files.

    Determines the most common barcode among the index reads (sampling up to
    --snifflimit records), optionally tabulates barcode counts and exits, and
    otherwise writes the reads matching (or, with --invert, not matching)
    that barcode to the output file.
    """
    parser = argparse.ArgumentParser(
        prog='barcodecop',
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'index',
        nargs='+',
        type=Opener(),
        metavar='file.fastq[.bz2|.gz]',
        help='one or two files containing index reads in fastq format')
    parser.add_argument('-f',
                        '--fastq',
                        type=Opener(),
                        metavar='file.fastq[.bz2|.gz]',
                        help='reads to filter in fastq format')
    parser.add_argument('-o',
                        '--outfile',
                        default=sys.stdout,
                        type=Opener('w'),
                        help='output fastq')
    parser.add_argument(
        '--snifflimit',
        type=int,
        default=10000,
        metavar='N',
        help='read no more than N records from the index file [%(default)s]')
    parser.add_argument('--head',
                        type=int,
                        metavar='N',
                        help='limit the output file to N records')
    parser.add_argument(
        '--min-pct-assignment',
        type=float,
        default=90.0,
        metavar='PERCENT',
        help=("""warn (or fail with an error; see --strict) if the
               most common barcode represents less than PERCENT of the
               total [%(default)s]"""))
    parser.add_argument(
        '--strict',
        action='store_true',
        default=False,
        help=("""fail if conditions of --min-pct-assignment are not met"""))
    parser.add_argument(
        '--invert',
        action='store_true',
        default=False,
        help='include only sequences *not* matching the most common barcode')
    parser.add_argument('-c',
                        '--show-counts',
                        action='store_true',
                        default=False,
                        help='tabulate barcode counts and exit')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        default=False,
                        help='minimize messages to stderr')
    parser.add_argument('-V',
                        '--version',
                        action=VersionAction,
                        version=__version__,
                        help='Print the version number and exit')

    args = parser.parse_args(arguments)

    logging.basicConfig(format='%(message)s',
                        level=logging.ERROR if args.quiet else logging.INFO)
    log = logging.getLogger(__name__)

    if len(args.index) == 1:
        bcseqs = fastqlite(args.index[0])
    elif len(args.index) == 2:
        bcseqs = combine_dual_indices(*args.index)
    else:
        log.error('error: please specify either one or two index files')
        # Without exiting here, execution would continue and crash with a
        # NameError because bcseqs was never assigned.
        sys.exit(1)

    bc1, bc2 = tee(bcseqs, 2)

    # determine the most common barcode from a bounded sample
    barcode_counts = Counter(
        [str(seq.seq) for seq in islice(bc1, args.snifflimit)])
    barcodes, counts = zip(*barcode_counts.most_common())

    most_common_bc = barcodes[0]
    most_common_pct = 100 * float(counts[0]) / sum(counts)
    log.info('most common barcode: {} ({}/{} = {:.2f}%)'.format(
        most_common_bc, counts[0], sum(counts), most_common_pct))

    if args.show_counts:
        for bc, count in barcode_counts.most_common():
            print('{}\t{}\t{}'.format(bc, seqdiff(most_common_bc, bc), count))
        return None

    if most_common_pct < args.min_pct_assignment:
        msg = 'frequency of most common barcode is less than {}%'.format(
            args.min_pct_assignment)
        if args.strict:
            log.error('Error: ' + msg)
            sys.exit(1)
        else:
            log.warning('Warning: ' + msg)

    if not args.fastq:
        log.error('specify a fastq format file to filter using -f/--fastq')
        sys.exit(1)

    seqs = fastqlite(args.fastq)
    # NOTE(review): 'filter' here takes four arguments, so it must be a
    # project-local helper shadowing the builtin -- confirm the import.
    filtered = islice(filter(bc2, seqs, most_common_bc, args.invert),
                      args.head)

    for seq in filtered:
        args.outfile.write(as_fastq(seq))
Example #5
0
def combine_dual_indices(file1, file2):
    """Yield records pairing dual-index reads, joining sequences with '+'.

    Raises ValueError if corresponding records have mismatched ids.
    """
    Seq = namedtuple('Seq', ['id', 'seq'])
    # NOTE(review): izip stops at the shorter input, so unequal-length files
    # are silently truncated -- confirm this is intended.
    for i1, i2 in izip(fastqlite(file1), fastqlite(file2)):
        # An assert would be stripped under -O; validate explicitly instead.
        if i1.id != i2.id:
            raise ValueError(
                'mismatched index ids: {} != {}'.format(i1.id, i2.id))
        yield Seq(id=i1.id, seq=i1.seq + '+' + i2.seq)
Example #6
0
def main(arguments=None):
    """Filter fastq reads by barcode match and/or barcode quality.

    Reads one or two index files, determines the most common barcode from a
    bounded sample, and writes reads that pass the selected filters (exact
    barcode match, minimum mean barcode quality) to the output file;
    --invert keeps the reads that fail instead.
    """
    parser = argparse.ArgumentParser(
        prog='barcodecop',
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'index',
        nargs='+',
        type=Opener(),
        metavar='file.fastq[.bz2|.gz]',
        help='one or two files containing index reads in fastq format')
    parser.add_argument('-f',
                        '--fastq',
                        type=Opener(),
                        metavar='file.fastq[.bz2|.gz]',
                        help='reads to filter in fastq format')
    parser.add_argument('-o',
                        '--outfile',
                        default=sys.stdout,
                        type=Opener('w'),
                        help='output fastq')
    parser.add_argument(
        '--snifflimit',
        type=int,
        default=10000,
        metavar='N',
        help='read no more than N records from the index file [%(default)s]')
    parser.add_argument('--head',
                        type=int,
                        metavar='N',
                        help='limit the output file to N records')
    parser.add_argument(
        '--invert',
        action='store_true',
        default=False,
        help='include only sequences failing filtering criteria')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        default=False,
                        help='minimize messages to stderr')
    parser.add_argument('-V',
                        '--version',
                        action=VersionAction,
                        version=__version__,
                        help='Print the version number and exit')

    match_options = parser.add_argument_group('Barcode matching options')

    match_options.add_argument(
        '--match-filter',
        action='store_true',
        default=False,
        help=('filter reads based on exact match to most common barcode '
              '[default: no match filter]'))
    match_options.add_argument(
        '--min-pct-assignment',
        type=float,
        default=90.0,
        metavar='PERCENT',
        help=("""warn (or fail with an error; see --strict) if the
               most common barcode represents less than PERCENT of the
               total [%(default)s]"""))
    match_options.add_argument(
        '--strict',
        action='store_true',
        default=False,
        help=("""fail if conditions of --min-pct-assignment are not met"""))
    match_options.add_argument('-c',
                               '--show-counts',
                               action='store_true',
                               default=False,
                               help='tabulate barcode counts and exit')

    qual_options = parser.add_argument_group(
        'Barcode quality filtering options')

    qual_options.add_argument(
        '--qual-filter',
        action='store_true',
        default=False,
        help=
        'filter reads based on minimum index quality [default: no quality filter]'
    )
    qual_options.add_argument(
        '-p',
        '--min-qual',
        type=int,
        default=MIN_QUAL,
        help="""reject seqs with mean barcode quality score less than
        this value; for dual index, both barcodes must meet the
        threshold [%(default)s]""")
    qual_options.add_argument('--encoding',
                              default='phred',
                              choices=['phred'],
                              help="""quality score encoding; see
             https://en.wikipedia.org/wiki/FASTQ_format [%(default)s]""")

    args = parser.parse_args(arguments)

    logging.basicConfig(format='%(message)s',
                        level=logging.ERROR if args.quiet else logging.INFO)
    log = logging.getLogger(__name__)

    # when provided with dual barcodes, concatenate into a single
    # namedtuple with attributes qual and qual1; generate a filter
    # function appropriate for either case.
    if len(args.index) == 1:
        bcseqs = fastqlite(args.index[0])
        qual_filter = get_qual_filter(args.min_qual, args.encoding)
    elif len(args.index) == 2:
        qual_filter = get_qual_filter(args.min_qual,
                                      args.encoding,
                                      paired=True)
        bcseqs = combine_dual_indices(*args.index)
    else:
        log.error('error: please specify either one or two index files')
        # Without exiting here, execution would continue and crash with a
        # NameError because bcseqs and qual_filter were never assigned.
        sys.exit(1)

    # use bc1 to determine most common barcode
    bc1, bc2 = tee(bcseqs, 2)

    # determine the most common barcode
    barcode_counts = Counter(
        [str(seq.seq) for seq in islice(bc1, args.snifflimit)])
    barcodes, counts = list(zip(*barcode_counts.most_common()))

    most_common_bc = barcodes[0]
    most_common_pct = 100 * float(counts[0]) / sum(counts)
    log.info('most common barcode: {} ({}/{} = {:.2f}%)'.format(
        most_common_bc, counts[0], sum(counts), most_common_pct))

    if args.show_counts:
        for bc, count in barcode_counts.most_common():
            print(('{}\t{}\t{}'.format(bc, seqdiff(most_common_bc, bc),
                                       count)))
        return None

    if most_common_pct < args.min_pct_assignment:
        msg = 'frequency of most common barcode is less than {}%'.format(
            args.min_pct_assignment)
        if args.strict:
            log.error('Error: ' + msg)
            sys.exit(1)
        else:
            log.warning('Warning: ' + msg)

    if not args.fastq:
        log.error('specify a fastq format file to filter using -f/--fastq')
        sys.exit(1)

    # --invert keeps the complement of whatever the filters would keep
    ifilterfun = filterfalse if args.invert else filter

    seqs = fastqlite(args.fastq)
    # pair each read with its index record; zip_longest surfaces a length
    # mismatch as a None element, which the id check below will reject
    filtered = zip_longest(seqs, bc2)

    if args.match_filter:
        filtered = ifilterfun(get_match_filter(most_common_bc), filtered)

    if args.qual_filter:
        filtered = ifilterfun(qual_filter, filtered)

    for seq, bc in islice(filtered, args.head):
        assert seq.id == bc.id
        args.outfile.write(as_fastq(seq))
Example #7
0
def main():
    """Combine paired fastq files (R1/R2), keeping only reads whose mate
    exists, and write the surviving pairs to two combined output files.
    """
    args_parser = argparse.ArgumentParser(
        description="""Given set(s) of paired reads in fastq format
        combine all into one pair of reads also in fastq format.
        Concurrently confirm all R1 have matched R2.
        """)

    args_parser.add_argument('--in-1',
                             '-1',
                             help='Read 1 Files to be combined.',
                             nargs='+',
                             required=True,
                             type=fastalite.Opener(mode='r'))
    args_parser.add_argument(
        '--in-2',
        '-2',
        help="""Read 2 Files to be combined. Must be in same order as --in-1""",
        nargs='+',
        required=True,
        type=fastalite.Opener(mode='r'))

    args_parser.add_argument(
        '--out-1',
        '-o1',
        help='File into which we should place our combined R1',
        required=True,
        type=fastalite.Opener(mode='w'))
    args_parser.add_argument(
        '--out-2',
        '-o2',
        help='File into which we should place our combined R2',
        required=True,
        type=fastalite.Opener(mode='w'))

    args_parser.add_argument(
        '--normalize-ids',
        '-ni',
        help='Normalize IDs for pairs by stripping /x from the end',
        action='store_true')

    args = args_parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    # Files are paired positionally, so the two lists must be the same length.
    assert len(args.in_1) == len(
        args.in_2), "Mismatched number of forward and reverse read files."

    # Loop 1: Identify ALL R1 and R2 IDs in all files.
    # Also look for duplicated IDs
    IDs_R1 = set()
    IDs_R2 = set()
    logging.info("Looping through files to identify all sequence IDs")
    for r1_h, r2_h in zip(args.in_1, args.in_2):
        # Will not use list comprehension to allow for error handling...
        # A ValueError from the fastq parser skips just that record;
        # StopIteration ends the file's scan.
        file_ids_r1 = set()
        r1_reader = fastalite.fastqlite(r1_h)
        try:
            while True:
                try:
                    sr = next(r1_reader)
                    file_ids_r1.add(get_seq_id(sr.id, args.normalize_ids))
                except ValueError:
                    pass
        except StopIteration:
            pass

        file_ids_r2 = set()
        r2_reader = fastalite.fastqlite(r2_h)
        try:
            while True:
                try:
                    sr = next(r2_reader)
                    file_ids_r2.add(get_seq_id(sr.id, args.normalize_ids))
                except ValueError:
                    pass
        except StopIteration:
            pass

        # Warn (but proceed) when a file repeats IDs already seen in earlier
        # files; duplicates are handled later by removal from overlapped_ids.
        if len(IDs_R1.intersection(file_ids_r1)) > 0:
            logging.warning(
                "{:,} of {:,} R1 read IDs from this file overlap with others".
                format(len(IDs_R1.intersection(file_ids_r1)),
                       len(file_ids_r1)))
        if len(IDs_R2.intersection(file_ids_r2)) > 0:
            logging.warning(
                "{:,} of {:,} R2 read IDs from this file overlap with others".
                format(len(IDs_R2.intersection(file_ids_r2)),
                       len(file_ids_r2)))
        IDs_R1.update(file_ids_r1)
        IDs_R2.update(file_ids_r2)
        # Rewind both handles so loop 2 can re-read the same files.
        # NOTE(review): assumes the Opener handles are seekable -- gzip/bz2
        # wrappers may not support seek(0); confirm with fastalite.Opener.
        r1_h.seek(0)
        r2_h.seek(0)

    # Only reads present in BOTH R1 and R2 survive.
    overlapped_ids = IDs_R1.intersection(IDs_R2)
    starting_num_ids = len(overlapped_ids)
    logging.info(
        "There are {:,} overlapping IDs from {:,} forward read IDs and {:,} reverse read IDs"
        .format(starting_num_ids, len(IDs_R1), len(IDs_R2)))

    # Loop 2: stream both files in lockstep, advancing each side past reads
    # whose mate is missing, and write out each matched pair once.
    # Exhaustion of either reader raises StopIteration, caught at the bottom.
    for r1_h, r2_h in zip(args.in_1, args.in_2):
        srs_r1 = fastalite.fastqlite(r1_h)
        srs_r2 = fastalite.fastqlite(r2_h)
        sr_1 = None
        sr_2 = None
        try:
            # Prime each side with its first parseable record.
            while sr_1 is None:
                try:
                    sr_1 = next(srs_r1)
                except ValueError:
                    sr_1 = None
            while sr_2 is None:
                try:
                    sr_2 = next(srs_r2)
                except ValueError:
                    sr_2 = None
            while len(overlapped_ids) > 0:
                if len(overlapped_ids) % 10000 == 0:
                    logging.info("{:,} of {:,} pairs remaining".format(
                        len(overlapped_ids),
                        starting_num_ids,
                    ))
                # Skip records whose id is not (or is no longer) in the
                # shared set -- unmatched reads and already-written dupes.
                while (sr_1 is None) or (get_seq_id(
                        sr_1.id, args.normalize_ids) not in overlapped_ids):
                    try:
                        sr_1 = next(srs_r1)
                    except ValueError:
                        sr_1 = None
                while (sr_2 is None) or (get_seq_id(
                        sr_2.id, args.normalize_ids) not in overlapped_ids):
                    try:
                        sr_2 = next(srs_r2)
                    except ValueError:
                        sr_2 = None
                # Both files are assumed to present shared reads in the same
                # relative order; a mismatch here means that assumption broke.
                assert get_seq_id(sr_1.id, args.normalize_ids) == get_seq_id(
                    sr_2.id, args.normalize_ids), "Order off of reads"
                # Implicit else paired and shared.
                # Remove it from the target list (takes care of duplicates)
                overlapped_ids.remove(get_seq_id(sr_1.id, args.normalize_ids))
                # Write out pair...
                write_fastq(sr_1, args.out_1)
                write_fastq(sr_2, args.out_2)
                # move to next
                try:
                    sr_1 = next(srs_r1)
                except ValueError:
                    pass
                try:
                    sr_2 = next(srs_r2)
                except ValueError:
                    pass
        except StopIteration:
            pass