Example no. 1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-a", "--first-fastq-file", dest="fastq1", type=str,
        help="supply read1 fastq file")
    parser.add_argument(
        "-b", "--second-fastq-file", dest="fastq2", type=str,
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              unknowns=True)

    if unknown and len(unknown) == 2:
        args.fastq1, args.fastq2 = unknown

    fastq1 = iotools.open_file(args.fastq1)
    fastq2 = iotools.open_file(args.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                              Fastq.iterate(fastq2)):
        if not (f1 and f2):
            raise PairedReadError(
                "unpaired reads detected. Are the files sorted and of "
                "equal length?")
        else:
            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            args.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.stop()
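These snippets are excerpts and omit their imports. A plausible common preamble, assuming the cgat-apps/cgat-core package layout (some snippets alias iotools as IOTools and Fastq as fq, and the pipeline examples additionally rely on P and PARAMS from their enclosing pipeline). The exception class raised in Example no. 1 is not defined anywhere in the excerpts, so the definition below is a hypothetical stand-in:

import collections
import copy
import os
import random
import re
import sys
from itertools import zip_longest

import cgatcore.experiment as E
import cgatcore.iotools as iotools
from cgat import Fastq, Genomics


class PairedReadError(Exception):
    """Raised when paired FASTQ files fall out of sync (hypothetical)."""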
Example no. 2
def filterReadsByPrimerMatch(infile, outfiles):
    '''Filter out reads where the start of read 1 does not match primer sequence (14bp)'''
    to_cluster = True
    primer = "a"
    if infile.find("_b.") > 0:
        primer = "b"
    if primer == "a":
        primer_seq = PARAMS["grep_primer_a"]
    else:
        primer_seq = PARAMS["grep_primer_b"]
    grep_filter_length = PARAMS["grep_filter_length"]
    primer_subseq = primer_seq[:grep_filter_length]

    track = P.snip(os.path.basename(infile), ".fastq.1.gz")
    infile2 = track + ".fastq.2.gz"
    outfile1, outfile2 = outfiles
    tempfile = "filtered/" + track + ".filtered.fastq.1.gz"

    # filter by primer match; open_file handles the gzip compression
    fastq_in = iotools.open_file(infile)
    fastq_out = iotools.open_file(tempfile, "w")
    for read in fq.iterate(fastq_in):
        if read.seq[:grep_filter_length] == primer_subseq:
            fastq_out.write("@%s\n%s\n+\n%s\n" %
                            (read.identifier, read.seq, read.quals))
    fastq_in.close()
    fastq_out.close()

    # reconcile read pairs
    statement = '''python %(scriptsdir)s/fastqs2fastq.py --method=reconcile %(tempfile)s %(infile2)s --output-filename-pattern=filtered/%(track)s.reconciled.fastq.%%i.gz'''
    P.run()
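For reference, the primer-prefix filter above can be reproduced with the standard library alone. A minimal sketch, assuming 4-line FASTQ records and gzipped input/output (file names and the primer prefix are placeholders):

import gzip
import itertools

def filter_by_primer_prefix(in_path, out_path, primer_subseq):
    # stream 4-line FASTQ records, keeping those whose sequence
    # starts with the primer prefix
    with gzip.open(in_path, "rt") as fin, gzip.open(out_path, "wt") as fout:
        while True:
            record = list(itertools.islice(fin, 4))
            if len(record) < 4:
                break
            header, seq, plus, qual = record
            if seq.startswith(primer_subseq):
                fout.writelines(record)

filter_by_primer_prefix("in.fastq.1.gz", "out.fastq.1.gz", "ACGTACGTACGTAC")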
Example no. 3
def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''
    Get species-level relative abundances for the simulated
    data. This involves creating maps between different identifiers
    from the NCBI taxonomy, so that the results are comparable
    to the species-level analysis from MetaPhlAn.
    '''
    levels = ["species", "genus", "family", "order", "class", "phylum"]
    taxa = open(infiles[1])
    header = taxa.readline()
    gi2taxa = {}
    for line in taxa.readlines():
        data = line[:-1].split("\t")
        gi, strain, species, genus, family, order, _class, phylum = data[:8]
        gi2taxa[gi] = (species, genus, family, order, _class, phylum)

    outf = open(outfile, "w")
    outf.write("level\ttaxa\trelab\n")
    for i in range(len(levels)):
        total = 0
        result = collections.defaultdict(int)
        for fastq in Fastq.iterate(iotools.open_file(infiles[0])):
            total += 1
            gi = fastq.identifier.split("|")[1]
            result[gi2taxa[gi][i]] += 1
        for taxon, value in result.items():
            outf.write("%s\t%s\t%s\n" %
                       (levels[i], taxon, float(value) / total))
    outf.close()
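Note that the loop above re-reads the FASTQ file once per taxonomic level, i.e. six passes. A single-pass variant that counts each gi once and then projects the counts onto every level might look like this (a sketch, reusing the gi2taxa map, levels list, and outf handle from the function above):

gi_counts = collections.defaultdict(int)
total = 0
for fastq in Fastq.iterate(iotools.open_file(infiles[0])):
    total += 1
    gi_counts[fastq.identifier.split("|")[1]] += 1

for i, level in enumerate(levels):
    result = collections.defaultdict(int)
    for gi, count in gi_counts.items():
        result[gi2taxa[gi][i]] += count
    for taxon, value in result.items():
        outf.write("%s\t%s\t%s\n" % (level, taxon, float(value) / total))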
Example no. 4
def buildExpectedCoverageOverGenomes(infiles, outfile):
    '''
    Take sequence files and estimate the theoretical
    coverage we would obtain over the genomes in the
    sample, i.e. at 1X coverage.
    '''

    # if the data are paired-end, only the read1 file is counted
    # below, so the per-genome counts must be doubled
    multiply = False
    if infiles[0].endswith(".fastq.1.gz"):
        multiply = True

    # the theoretical coverage is defined as
    # (read length (L) * no. reads (N)) / genome size (G) (bp)

    # get genome sizes into memory
    genomes = open(infiles[1])
    header = genomes.readline()
    genome_sizes = {}
    for line in genomes.readlines():
        data = line[:-1].split("\t")
        gi = data[0].split("_")[1]
        size = data[1]
        genome_sizes[gi] = size

    # count the number of reads observed per genome
    expected_genome_sizes = collections.defaultdict(int)
    E.info("iterating over fastq file")
    for fastq in Fastq.iterate(iotools.open_file(infiles[0])):
        gi = fastq.identifier.split("|")[1]
        expected_genome_sizes[gi] += 1
    E.info("iterating over fastq file: DONE")

    # get the proportion of each genome covered
    outf = open(outfile, "w")
    outf.write("gi\texpected_coverage\n")
    for gi, size in expected_genome_sizes.items():
        if multiply:
            size = size * 2
        if gi not in genome_sizes:
            E.warn("could not find gi no. %s in dictionary" % gi)
            continue
        proportion_coverage = float(size) / float(genome_sizes[gi])
        if proportion_coverage > 1:
            proportion_coverage = 1
        outf.write("%s\t%f\n" % (gi, proportion_coverage))
    outf.close()
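The formula in the comment above, coverage = (read length L × number of reads N) / genome size G, is easy to sanity-check by hand. A small sketch with made-up numbers:

read_length = 100        # L, in bp (hypothetical value)
n_reads = 500000         # N; if paired-end, count both mates
genome_size = 5000000    # G, in bp

coverage = (read_length * n_reads) / genome_size
print(coverage)  # 10.0, i.e. 10X expected coverage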
Example no. 5
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--to-drop-single", dest='to_remove_singletons')
    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-drop1", dest="fq_dropped1")

    (options, args) = E.start(parser, argv=argv)

    reads_to_remove = IOTools.open_file(
        options.to_remove_singletons).readlines()
    reads_to_remove = set([x.strip() for x in reads_to_remove])

    fastq_out = IOTools.open_file(options.fq_out1, 'w')
    fastq_host = IOTools.open_file(options.fq_dropped1, 'w')

    reads = 0
    dropped_reads = 0
    for read in Fastq.iterate(IOTools.open_file(options.fastq1)):
        reads += 1
        if read.identifier.split()[0] in reads_to_remove:
            fastq_host.write("@%s\n%s\n+\n%s\n" %
                             (read.identifier, read.seq, read.quals))
            dropped_reads += 1
        else:
            fastq_out.write("@%s\n%s\n+\n%s\n" %
                            (read.identifier, read.seq, read.quals))

    fastq_out.close()
    fastq_host.close()

    try:
        percent_dropped = dropped_reads / float(reads) * 100
    except ZeroDivisionError:
        percent_dropped = 0.0

    E.info('Dropped %i of %i reads (%f percent)' \
           % (dropped_reads, reads, percent_dropped))
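A hypothetical way to exercise Example no. 5 programmatically; the option names come from the parser above, while the script name and file names are placeholders:

main(argv=["drop_singletons.py",
           "--fastq1=sample.fastq.1.gz",
           "--to-drop-single=singleton_ids.txt",
           "--fastq-out1=sample.kept.fastq.1.gz",
           "--fastq-drop1=sample.dropped.fastq.1.gz"])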
Example no. 6
def process_cgat(options):

    c = E.Counter()

    assert options.input_fastq_file == "-"

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "reverse-complement":
        for record in Fastq.iterate(options.stdin):
            record.seq = Genomics.reverse_complement(record.seq)
            record.quals = record.quals[::-1]
            options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = iotools.open_file(options.output_filename_pattern, "w")

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(iotools.open_file(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(iotools.read_list(iotools.open_file(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(iotools.open_file(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = iotools.open_file(options.output_filename_pattern, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))
    return c
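A note on the sample method above: each record (or pair) is kept independently with probability sample_threshold, so the output contains approximately, not exactly, that fraction of reads; the fixed seed makes the draw reproducible and keeps the two mates of a pair in sync. A standalone sketch of the same Bernoulli draw (numbers are placeholders):

import random

random.seed(42)           # placeholder seed, mirrors options.seed
n_records = 1000          # stand-in for the number of FASTQ records
threshold = 0.1           # keep roughly 10% of records

kept = sum(1 for _ in range(n_records) if random.random() <= threshold)
print(kept)  # close to 100, but usually not exactly 100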
Example no. 7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(method="join", )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply exactly two fastq files on the command line")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(iotools.open_file(fn1))
        iter2 = Fastq.iterate(iotools.open_file(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples
            s1, q1 = left.seq, left.quals
            d = collections.defaultdict(list)
            for x in range(len(s1) - tuple_size):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals
            s2 = Genomics.reverse_complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size):
                kmer = s2[x:x + tuple_size]
                for y in d[kmer]:
                    offsets[x - y] += 1

            # find the maximum diagonal (avoid shadowing the counter
            # `c` and the builtin `sorted`)
            ranked = sorted((count, offset) for offset, count in offsets.items())
            max_count, max_offset = ranked[-1]

            E.debug('%s: maximum offset at %i' % (left.identifier, max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            outfile.write("%s\n" % new_entry)
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
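The join method above is a small dot-plot trick: every k-mer shared between read 1 and the reverse-complemented read 2 casts one vote for the diagonal (the offset between its two positions), and the best-supported diagonal defines the overlap used for the merge. A toy sketch of the voting step on plain strings (the function and test strings are made up):

import collections

def best_offset(s1, s2, k=2):
    # index the start positions of every k-mer in s1
    d = collections.defaultdict(list)
    for x in range(len(s1) - k):
        d[s1[x:x + k]].append(x)
    # each shared k-mer votes for the diagonal (pos_in_s2 - pos_in_s1)
    offsets = collections.defaultdict(int)
    for x in range(len(s2) - k):
        for y in d[s2[x:x + k]]:
            offsets[x - y] += 1
    count, offset = max((v, o) for o, v in offsets.items())
    return offset, count

print(best_offset("AAACCCGGG", "CCGGGTTT"))
# (-4, 3): s2 starts 4 bases into s1, supported by 3 shared 2-mers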
Example no. 8
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--fastq2", dest="fastq2")
    parser.add_option("--fastq3", dest="fastq3")

    parser.add_option("--to-drop-paired", dest='to_remove_paired')
    parser.add_option("--to-drop-single", dest='to_remove_singletons')

    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-out2", dest="fq_out2")
    parser.add_option("--fastq-out3", dest="fq_out3")

    parser.add_option("--fastq-drop1", dest="fq_dropped1")
    parser.add_option("--fastq-drop2", dest="fq_dropped2")
    parser.add_option("--fastq-drop3", dest="fq_dropped3")

    (options, args) = E.start(parser, argv=argv)

    # Fetch the reads to remove
    pairs_to_remove = IOTools.open_file(options.to_remove_paired).readlines()
    pairs_to_remove = set([x.strip() for x in pairs_to_remove])

    singles_to_remove = IOTools.open_file(
        options.to_remove_singletons).readlines()
    singles_to_remove = set([x.strip() for x in singles_to_remove])

    # open the outfiles
    fastq1_out = IOTools.open_file(options.fq_out1, 'w')
    fastq2_out = IOTools.open_file(options.fq_out2, 'w')
    fastq3_out = IOTools.open_file(options.fq_out3, 'w')

    fastq1_host = IOTools.open_file(options.fq_dropped1, 'w')
    fastq2_host = IOTools.open_file(options.fq_dropped2, 'w')
    fastq3_host = IOTools.open_file(options.fq_dropped3, 'w')

    dropped_pairs = 0
    pairs = 0
    # Drop the paired reads
    for read1, read2 in zip(Fastq.iterate(IOTools.open_file(options.fastq1)),
                            Fastq.iterate(IOTools.open_file(options.fastq2))):
        pairs += 1

        # bmtagger truncates fastq headers at the first space and will
        # not accept non-identical headers; therefore, if either read
        # matches, both are dropped.
        r1_id = read1.identifier.split()[0]
        r2_id = read2.identifier.split()[0]

        if r1_id in pairs_to_remove or r2_id in pairs_to_remove:
            # Both are host
            fastq1_host.write("@%s\n%s\n+\n%s\n" %
                              (read1.identifier, read1.seq, read1.quals))
            fastq2_host.write("@%s\n%s\n+\n%s\n" %
                              (read2.identifier, read2.seq, read2.quals))
            dropped_pairs += 1
        else:
            # Neither are host
            fastq1_out.write("@%s\n%s\n+\n%s\n" %
                             (read1.identifier, read1.seq, read1.quals))
            fastq2_out.write("@%s\n%s\n+\n%s\n" %
                             (read2.identifier, read2.seq, read2.quals))
    # Drop singletons
    singletons = 0
    dropped_singletons = 0
    for read in Fastq.iterate(IOTools.open_file(options.fastq3)):
        singletons += 1
        if read.identifier.split()[0] in singles_to_remove:
            fastq3_host.write("@%s\n%s\n+\n%s\n" %
                              (read.identifier, read.seq, read.quals))
            dropped_singletons += 1
        else:
            fastq3_out.write("@%s\n%s\n+\n%s\n" %
                             (read.identifier, read.seq, read.quals))

    fastq1_out.close()
    fastq2_out.close()
    fastq3_out.close()
    fastq1_host.close()
    fastq2_host.close()
    fastq3_host.close()

    try:
        percent_pairs = dropped_pairs / float(pairs) * 100
    except ZeroDivisionError:
        percent_pairs = 0.0
    try:
        percent_singletons = dropped_singletons / float(singletons) * 100
    except ZeroDivisionError:
        percent_singletons = 0.0

    E.info('Dropped %i of %i read pairs (%f percent)' \
           % (dropped_pairs, pairs, percent_pairs))
    E.info('Dropped %i of %i singletons (%f percent)' \
           % (dropped_singletons, singletons, percent_singletons))